{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4943642475776152, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 3.232854127883911, "epoch": 0, "mean_token_accuracy": 0.6640726327896118, "num_tokens": 5393.0, "step": 0, "train/ce_loss": 2.2808852195739746 }, { "epoch": 0, "step": 0, "train/sim_loss": 1.015625 }, { "epoch": 0, "step": 0, "train/total_loss": 1.2437134981155396 }, { "entropy": 3.0902302265167236, "epoch": 9.887284951552304e-05, "mean_token_accuracy": 0.700964629650116, "num_tokens": 10878.0, "step": 1, "train/ce_loss": 1.4573792219161987 }, { "epoch": 9.887284951552304e-05, "step": 1, "train/sim_loss": 1.0078125 }, { "epoch": 9.887284951552304e-05, "step": 1, "train/total_loss": 1.153550386428833 }, { "entropy": 2.9726977348327637, "epoch": 0.00019774569903104609, "mean_token_accuracy": 0.7170838117599487, "num_tokens": 16370.0, "step": 2, "train/ce_loss": 1.625307321548462 }, { "epoch": 0.00019774569903104609, "step": 2, "train/sim_loss": 0.98828125 }, { "epoch": 0.00019774569903104609, "step": 2, "train/total_loss": 1.150812029838562 }, { "entropy": 3.1445326805114746, "epoch": 0.00029661854854656913, "mean_token_accuracy": 0.6801980137825012, "num_tokens": 22036.0, "step": 3, "train/ce_loss": 0.9109308123588562 }, { "epoch": 0.00029661854854656913, "step": 3, "train/sim_loss": 0.984375 }, { "epoch": 0.00029661854854656913, "step": 3, "train/total_loss": 1.0754680633544922 }, { "entropy": 3.254225254058838, "epoch": 0.00039549139806209217, "mean_token_accuracy": 0.6901408433914185, "num_tokens": 27600.0, "step": 4, "train/ce_loss": 1.572279691696167 }, { "epoch": 0.00039549139806209217, "step": 4, "train/sim_loss": 0.98828125 }, { "epoch": 0.00039549139806209217, "step": 4, "train/total_loss": 1.1455092430114746 }, { "entropy": 3.2603909969329834, "epoch": 0.0004943642475776152, "mean_token_accuracy": 0.7244348526000977, "num_tokens": 33291.0, "step": 5, "train/ce_loss": 0.8163019418716431 }, { "epoch": 0.0004943642475776152, "step": 5, "train/sim_loss": 0.97265625 }, { "epoch": 0.0004943642475776152, "step": 5, "train/total_loss": 1.0542864799499512 }, { "entropy": 3.453758716583252, "epoch": 0.0005932370970931383, "mean_token_accuracy": 0.7166666388511658, "num_tokens": 38881.0, "step": 6, "train/ce_loss": 1.2340606451034546 }, { "epoch": 0.0005932370970931383, "step": 6, "train/sim_loss": 0.984375 }, { "epoch": 0.0005932370970931383, "step": 6, "train/total_loss": 1.1077810525894165 }, { "entropy": 3.1682183742523193, "epoch": 0.0006921099466086612, "mean_token_accuracy": 0.6804878115653992, "num_tokens": 44335.0, "step": 7, "train/ce_loss": 1.2763797044754028 }, { "epoch": 0.0006921099466086612, "step": 7, "train/sim_loss": 0.921875 }, { "epoch": 0.0006921099466086612, "step": 7, "train/total_loss": 1.0495129823684692 }, { "entropy": 3.721844434738159, "epoch": 0.0007909827961241843, "mean_token_accuracy": 0.6925133466720581, "num_tokens": 49698.0, "step": 8, "train/ce_loss": 1.0785220861434937 }, { "epoch": 0.0007909827961241843, "step": 8, "train/sim_loss": 0.91796875 }, { "epoch": 0.0007909827961241843, "step": 8, "train/total_loss": 1.0258209705352783 }, { "entropy": 3.7266950607299805, "epoch": 0.0008898556456397073, "mean_token_accuracy": 0.6374502182006836, "num_tokens": 55074.0, "step": 9, "train/ce_loss": 1.3228412866592407 }, { "epoch": 0.0008898556456397073, "step": 9, "train/sim_loss": 0.87109375 }, { "epoch": 0.0008898556456397073, "step": 9, "train/total_loss": 1.003377914428711 }, { "entropy": 3.322518825531006, "epoch": 0.0009887284951552303, "mean_token_accuracy": 0.7234848737716675, "num_tokens": 60667.0, "step": 10, "train/ce_loss": 0.738264262676239 }, { "epoch": 0.0009887284951552303, "step": 10, "train/sim_loss": 0.8203125 }, { "epoch": 0.0009887284951552303, "step": 10, "train/total_loss": 0.8941389322280884 }, { "entropy": 3.5612030029296875, "epoch": 0.0010876013446707534, "mean_token_accuracy": 0.7497155666351318, "num_tokens": 66223.0, "step": 11, "train/ce_loss": 0.8377095460891724 }, { "epoch": 0.0010876013446707534, "step": 11, "train/sim_loss": 0.765625 }, { "epoch": 0.0010876013446707534, "step": 11, "train/total_loss": 0.8493959307670593 }, { "entropy": 3.775777578353882, "epoch": 0.0011864741941862765, "mean_token_accuracy": 0.7175792455673218, "num_tokens": 71524.0, "step": 12, "train/ce_loss": 1.3311091661453247 }, { "epoch": 0.0011864741941862765, "step": 12, "train/sim_loss": 0.71875 }, { "epoch": 0.0011864741941862765, "step": 12, "train/total_loss": 0.8518609404563904 }, { "entropy": 3.8853039741516113, "epoch": 0.0012853470437017994, "mean_token_accuracy": 0.716152012348175, "num_tokens": 76879.0, "step": 13, "train/ce_loss": 0.7692499756813049 }, { "epoch": 0.0012853470437017994, "step": 13, "train/sim_loss": 0.625 }, { "epoch": 0.0012853470437017994, "step": 13, "train/total_loss": 0.7019249796867371 }, { "entropy": 4.225286960601807, "epoch": 0.0013842198932173225, "mean_token_accuracy": 0.7326478362083435, "num_tokens": 82283.0, "step": 14, "train/ce_loss": 0.9510319828987122 }, { "epoch": 0.0013842198932173225, "step": 14, "train/sim_loss": 0.6171875 }, { "epoch": 0.0013842198932173225, "step": 14, "train/total_loss": 0.7122907042503357 }, { "entropy": 4.470114707946777, "epoch": 0.0014830927427328456, "mean_token_accuracy": 0.7123411893844604, "num_tokens": 88015.0, "step": 15, "train/ce_loss": 0.7988050580024719 }, { "epoch": 0.0014830927427328456, "step": 15, "train/sim_loss": 0.5390625 }, { "epoch": 0.0014830927427328456, "step": 15, "train/total_loss": 0.6189429759979248 }, { "entropy": 4.795734405517578, "epoch": 0.0015819655922483687, "mean_token_accuracy": 0.6881837844848633, "num_tokens": 93540.0, "step": 16, "train/ce_loss": 1.4160321950912476 }, { "epoch": 0.0015819655922483687, "step": 16, "train/sim_loss": 0.578125 }, { "epoch": 0.0015819655922483687, "step": 16, "train/total_loss": 0.7197282314300537 }, { "entropy": 4.660637378692627, "epoch": 0.0016808384417638916, "mean_token_accuracy": 0.7374213933944702, "num_tokens": 98804.0, "step": 17, "train/ce_loss": 1.0615134239196777 }, { "epoch": 0.0016808384417638916, "step": 17, "train/sim_loss": 0.48828125 }, { "epoch": 0.0016808384417638916, "step": 17, "train/total_loss": 0.5944325923919678 }, { "entropy": 4.343752861022949, "epoch": 0.0017797112912794147, "mean_token_accuracy": 0.7798408269882202, "num_tokens": 104649.0, "step": 18, "train/ce_loss": 0.8228769302368164 }, { "epoch": 0.0017797112912794147, "step": 18, "train/sim_loss": 0.49609375 }, { "epoch": 0.0017797112912794147, "step": 18, "train/total_loss": 0.5783814191818237 }, { "entropy": 5.168272972106934, "epoch": 0.0018785841407949378, "mean_token_accuracy": 0.6962190270423889, "num_tokens": 110054.0, "step": 19, "train/ce_loss": 0.9622052907943726 }, { "epoch": 0.0018785841407949378, "step": 19, "train/sim_loss": 0.41796875 }, { "epoch": 0.0018785841407949378, "step": 19, "train/total_loss": 0.5141893029212952 }, { "epoch": 0.0019774569903104606, "grad_norm": 1.7004696130752563, "learning_rate": 9.997774810858923e-06, "loss": 0.905, "step": 20 }, { "entropy": 4.543365478515625, "epoch": 0.0019774569903104606, "mean_token_accuracy": 0.7317554354667664, "num_tokens": 115702.0, "step": 20, "train/ce_loss": 0.48676741123199463 }, { "epoch": 0.0019774569903104606, "step": 20, "train/sim_loss": 0.41796875 }, { "epoch": 0.0019774569903104606, "step": 20, "train/total_loss": 0.4666454792022705 }, { "entropy": 4.754147529602051, "epoch": 0.002076329839825984, "mean_token_accuracy": 0.7524271607398987, "num_tokens": 121116.0, "step": 21, "train/ce_loss": 1.0305498838424683 }, { "epoch": 0.002076329839825984, "step": 21, "train/sim_loss": 0.4140625 }, { "epoch": 0.002076329839825984, "step": 21, "train/total_loss": 0.5171175003051758 }, { "entropy": 4.936140537261963, "epoch": 0.002175202689341507, "mean_token_accuracy": 0.6633166074752808, "num_tokens": 126477.0, "step": 22, "train/ce_loss": 0.9855369925498962 }, { "epoch": 0.002175202689341507, "step": 22, "train/sim_loss": 0.41015625 }, { "epoch": 0.002175202689341507, "step": 22, "train/total_loss": 0.5087099671363831 }, { "entropy": 4.547977924346924, "epoch": 0.0022740755388570297, "mean_token_accuracy": 0.726190447807312, "num_tokens": 132045.0, "step": 23, "train/ce_loss": 1.3456655740737915 }, { "epoch": 0.0022740755388570297, "step": 23, "train/sim_loss": 0.32421875 }, { "epoch": 0.0022740755388570297, "step": 23, "train/total_loss": 0.4587852954864502 }, { "entropy": 4.663654804229736, "epoch": 0.002372948388372553, "mean_token_accuracy": 0.7007575631141663, "num_tokens": 137465.0, "step": 24, "train/ce_loss": 0.7522186040878296 }, { "epoch": 0.002372948388372553, "step": 24, "train/sim_loss": 0.36328125 }, { "epoch": 0.002372948388372553, "step": 24, "train/total_loss": 0.43850311636924744 }, { "entropy": 5.121295928955078, "epoch": 0.002471821237888076, "mean_token_accuracy": 0.6640712022781372, "num_tokens": 143080.0, "step": 25, "train/ce_loss": 0.9343672394752502 }, { "epoch": 0.002471821237888076, "step": 25, "train/sim_loss": 0.35546875 }, { "epoch": 0.002471821237888076, "step": 25, "train/total_loss": 0.44890546798706055 }, { "entropy": 5.1091227531433105, "epoch": 0.002570694087403599, "mean_token_accuracy": 0.7408758997917175, "num_tokens": 148539.0, "step": 26, "train/ce_loss": 0.9452911019325256 }, { "epoch": 0.002570694087403599, "step": 26, "train/sim_loss": 0.29296875 }, { "epoch": 0.002570694087403599, "step": 26, "train/total_loss": 0.3874978721141815 }, { "entropy": 5.341334819793701, "epoch": 0.002669566936919122, "mean_token_accuracy": 0.7456258535385132, "num_tokens": 153897.0, "step": 27, "train/ce_loss": 0.9303287267684937 }, { "epoch": 0.002669566936919122, "step": 27, "train/sim_loss": 0.296875 }, { "epoch": 0.002669566936919122, "step": 27, "train/total_loss": 0.3899078667163849 }, { "entropy": 4.900827884674072, "epoch": 0.002768439786434645, "mean_token_accuracy": 0.7773972749710083, "num_tokens": 159607.0, "step": 28, "train/ce_loss": 0.5460175275802612 }, { "epoch": 0.002768439786434645, "step": 28, "train/sim_loss": 0.359375 }, { "epoch": 0.002768439786434645, "step": 28, "train/total_loss": 0.4139767587184906 }, { "entropy": 5.215836524963379, "epoch": 0.0028673126359501683, "mean_token_accuracy": 0.7470011115074158, "num_tokens": 165191.0, "step": 29, "train/ce_loss": 1.0395358800888062 }, { "epoch": 0.0028673126359501683, "step": 29, "train/sim_loss": 0.390625 }, { "epoch": 0.0028673126359501683, "step": 29, "train/total_loss": 0.49457859992980957 }, { "entropy": 5.204288482666016, "epoch": 0.002966185485465691, "mean_token_accuracy": 0.7364506125450134, "num_tokens": 170725.0, "step": 30, "train/ce_loss": 0.8963356614112854 }, { "epoch": 0.002966185485465691, "step": 30, "train/sim_loss": 0.37890625 }, { "epoch": 0.002966185485465691, "step": 30, "train/total_loss": 0.468539834022522 }, { "entropy": 5.56150484085083, "epoch": 0.003065058334981214, "mean_token_accuracy": 0.7144719958305359, "num_tokens": 176060.0, "step": 31, "train/ce_loss": 0.6763451099395752 }, { "epoch": 0.003065058334981214, "step": 31, "train/sim_loss": 0.3203125 }, { "epoch": 0.003065058334981214, "step": 31, "train/total_loss": 0.3879470229148865 }, { "entropy": 4.9518866539001465, "epoch": 0.0031639311844967374, "mean_token_accuracy": 0.7087156176567078, "num_tokens": 181871.0, "step": 32, "train/ce_loss": 0.4959302842617035 }, { "epoch": 0.0031639311844967374, "step": 32, "train/sim_loss": 0.33203125 }, { "epoch": 0.0031639311844967374, "step": 32, "train/total_loss": 0.3816242814064026 }, { "entropy": 5.0430498123168945, "epoch": 0.0032628040340122602, "mean_token_accuracy": 0.7303506731987, "num_tokens": 187315.0, "step": 33, "train/ce_loss": 0.8644870519638062 }, { "epoch": 0.0032628040340122602, "step": 33, "train/sim_loss": 0.26953125 }, { "epoch": 0.0032628040340122602, "step": 33, "train/total_loss": 0.35597994923591614 }, { "entropy": 5.6410627365112305, "epoch": 0.003361676883527783, "mean_token_accuracy": 0.7267573475837708, "num_tokens": 192805.0, "step": 34, "train/ce_loss": 0.8924073576927185 }, { "epoch": 0.003361676883527783, "step": 34, "train/sim_loss": 0.2734375 }, { "epoch": 0.003361676883527783, "step": 34, "train/total_loss": 0.3626782298088074 }, { "entropy": 5.750243186950684, "epoch": 0.0034605497330433064, "mean_token_accuracy": 0.6893453001976013, "num_tokens": 198215.0, "step": 35, "train/ce_loss": 1.1888625621795654 }, { "epoch": 0.0034605497330433064, "step": 35, "train/sim_loss": 0.29296875 }, { "epoch": 0.0034605497330433064, "step": 35, "train/total_loss": 0.411855012178421 }, { "entropy": 5.6438422203063965, "epoch": 0.0035594225825588293, "mean_token_accuracy": 0.749492883682251, "num_tokens": 203824.0, "step": 36, "train/ce_loss": 0.9351983666419983 }, { "epoch": 0.0035594225825588293, "step": 36, "train/sim_loss": 0.29296875 }, { "epoch": 0.0035594225825588293, "step": 36, "train/total_loss": 0.38648858666419983 }, { "entropy": 5.735121726989746, "epoch": 0.003658295432074352, "mean_token_accuracy": 0.7415204644203186, "num_tokens": 209254.0, "step": 37, "train/ce_loss": 0.8138383626937866 }, { "epoch": 0.003658295432074352, "step": 37, "train/sim_loss": 0.23828125 }, { "epoch": 0.003658295432074352, "step": 37, "train/total_loss": 0.3196650743484497 }, { "entropy": 5.749444961547852, "epoch": 0.0037571682815898755, "mean_token_accuracy": 0.7747126221656799, "num_tokens": 214652.0, "step": 38, "train/ce_loss": 0.6772257685661316 }, { "epoch": 0.0037571682815898755, "step": 38, "train/sim_loss": 0.33203125 }, { "epoch": 0.0037571682815898755, "step": 38, "train/total_loss": 0.3997538387775421 }, { "entropy": 6.239436149597168, "epoch": 0.0038560411311053984, "mean_token_accuracy": 0.6857476830482483, "num_tokens": 220085.0, "step": 39, "train/ce_loss": 1.5776121616363525 }, { "epoch": 0.0038560411311053984, "step": 39, "train/sim_loss": 0.265625 }, { "epoch": 0.0038560411311053984, "step": 39, "train/total_loss": 0.42338621616363525 }, { "epoch": 0.003954913980620921, "grad_norm": 1.1784355640411377, "learning_rate": 9.992829946100975e-06, "loss": 0.4278, "step": 40 }, { "entropy": 5.909239768981934, "epoch": 0.003954913980620921, "mean_token_accuracy": 0.7469586133956909, "num_tokens": 225571.0, "step": 40, "train/ce_loss": 0.6264683604240417 }, { "epoch": 0.003954913980620921, "step": 40, "train/sim_loss": 0.26171875 }, { "epoch": 0.003954913980620921, "step": 40, "train/total_loss": 0.3243655860424042 }, { "entropy": 6.159171104431152, "epoch": 0.004053786830136444, "mean_token_accuracy": 0.6941529512405396, "num_tokens": 230885.0, "step": 41, "train/ce_loss": 1.1460374593734741 }, { "epoch": 0.004053786830136444, "step": 41, "train/sim_loss": 0.31640625 }, { "epoch": 0.004053786830136444, "step": 41, "train/total_loss": 0.43101000785827637 }, { "entropy": 5.613990783691406, "epoch": 0.004152659679651968, "mean_token_accuracy": 0.7592592835426331, "num_tokens": 236456.0, "step": 42, "train/ce_loss": 0.8396984934806824 }, { "epoch": 0.004152659679651968, "step": 42, "train/sim_loss": 0.296875 }, { "epoch": 0.004152659679651968, "step": 42, "train/total_loss": 0.3808448612689972 }, { "entropy": 5.884822845458984, "epoch": 0.004251532529167491, "mean_token_accuracy": 0.7281045913696289, "num_tokens": 241827.0, "step": 43, "train/ce_loss": 0.7757571339607239 }, { "epoch": 0.004251532529167491, "step": 43, "train/sim_loss": 0.28125 }, { "epoch": 0.004251532529167491, "step": 43, "train/total_loss": 0.3588257133960724 }, { "entropy": 6.106474876403809, "epoch": 0.004350405378683014, "mean_token_accuracy": 0.7413793206214905, "num_tokens": 247191.0, "step": 44, "train/ce_loss": 0.7632415294647217 }, { "epoch": 0.004350405378683014, "step": 44, "train/sim_loss": 0.265625 }, { "epoch": 0.004350405378683014, "step": 44, "train/total_loss": 0.3419491648674011 }, { "entropy": 5.984395980834961, "epoch": 0.0044492782281985365, "mean_token_accuracy": 0.67739337682724, "num_tokens": 252998.0, "step": 45, "train/ce_loss": 0.4224933981895447 }, { "epoch": 0.0044492782281985365, "step": 45, "train/sim_loss": 0.203125 }, { "epoch": 0.0044492782281985365, "step": 45, "train/total_loss": 0.24537433683872223 }, { "entropy": 6.748654365539551, "epoch": 0.004548151077714059, "mean_token_accuracy": 0.7295514345169067, "num_tokens": 258340.0, "step": 46, "train/ce_loss": 0.7559360265731812 }, { "epoch": 0.004548151077714059, "step": 46, "train/sim_loss": 0.21875 }, { "epoch": 0.004548151077714059, "step": 46, "train/total_loss": 0.29434359073638916 }, { "entropy": 6.564530372619629, "epoch": 0.004647023927229583, "mean_token_accuracy": 0.702627956867218, "num_tokens": 263817.0, "step": 47, "train/ce_loss": 1.3968374729156494 }, { "epoch": 0.004647023927229583, "step": 47, "train/sim_loss": 0.2421875 }, { "epoch": 0.004647023927229583, "step": 47, "train/total_loss": 0.3818712532520294 }, { "entropy": 6.254408836364746, "epoch": 0.004745896776745106, "mean_token_accuracy": 0.7435897588729858, "num_tokens": 269149.0, "step": 48, "train/ce_loss": 1.10716712474823 }, { "epoch": 0.004745896776745106, "step": 48, "train/sim_loss": 0.21875 }, { "epoch": 0.004745896776745106, "step": 48, "train/total_loss": 0.32946670055389404 }, { "entropy": 6.526291847229004, "epoch": 0.004844769626260629, "mean_token_accuracy": 0.7911964058876038, "num_tokens": 274663.0, "step": 49, "train/ce_loss": 0.8739025592803955 }, { "epoch": 0.004844769626260629, "step": 49, "train/sim_loss": 0.23046875 }, { "epoch": 0.004844769626260629, "step": 49, "train/total_loss": 0.3178589940071106 }, { "entropy": 6.203254222869873, "epoch": 0.004943642475776152, "mean_token_accuracy": 0.7107081413269043, "num_tokens": 280395.0, "step": 50, "train/ce_loss": 0.5470512509346008 }, { "epoch": 0.004943642475776152, "step": 50, "train/sim_loss": 0.23828125 }, { "epoch": 0.004943642475776152, "step": 50, "train/total_loss": 0.29298636317253113 }, { "entropy": 6.308405876159668, "epoch": 0.005042515325291675, "mean_token_accuracy": 0.7507447600364685, "num_tokens": 286070.0, "step": 51, "train/ce_loss": 0.5457269549369812 }, { "epoch": 0.005042515325291675, "step": 51, "train/sim_loss": 0.23046875 }, { "epoch": 0.005042515325291675, "step": 51, "train/total_loss": 0.2850414514541626 }, { "entropy": 6.458256721496582, "epoch": 0.005141388174807198, "mean_token_accuracy": 0.7625979781150818, "num_tokens": 291594.0, "step": 52, "train/ce_loss": 0.93619304895401 }, { "epoch": 0.005141388174807198, "step": 52, "train/sim_loss": 0.2578125 }, { "epoch": 0.005141388174807198, "step": 52, "train/total_loss": 0.35143181681632996 }, { "entropy": 6.347724914550781, "epoch": 0.005240261024322721, "mean_token_accuracy": 0.6636971235275269, "num_tokens": 297131.0, "step": 53, "train/ce_loss": 1.0742437839508057 }, { "epoch": 0.005240261024322721, "step": 53, "train/sim_loss": 0.2421875 }, { "epoch": 0.005240261024322721, "step": 53, "train/total_loss": 0.34961187839508057 }, { "entropy": 6.71110725402832, "epoch": 0.005339133873838244, "mean_token_accuracy": 0.7581047415733337, "num_tokens": 302507.0, "step": 54, "train/ce_loss": 1.15473473072052 }, { "epoch": 0.005339133873838244, "step": 54, "train/sim_loss": 0.2578125 }, { "epoch": 0.005339133873838244, "step": 54, "train/total_loss": 0.3732859790325165 }, { "entropy": 6.332487106323242, "epoch": 0.005438006723353767, "mean_token_accuracy": 0.7537473440170288, "num_tokens": 308071.0, "step": 55, "train/ce_loss": 0.46033522486686707 }, { "epoch": 0.005438006723353767, "step": 55, "train/sim_loss": 0.25 }, { "epoch": 0.005438006723353767, "step": 55, "train/total_loss": 0.2960335314273834 }, { "entropy": 6.959592342376709, "epoch": 0.00553687957286929, "mean_token_accuracy": 0.75, "num_tokens": 313648.0, "step": 56, "train/ce_loss": 0.5643681883811951 }, { "epoch": 0.00553687957286929, "step": 56, "train/sim_loss": 0.33203125 }, { "epoch": 0.00553687957286929, "step": 56, "train/total_loss": 0.38846805691719055 }, { "entropy": 6.4645891189575195, "epoch": 0.005635752422384813, "mean_token_accuracy": 0.7302231192588806, "num_tokens": 319181.0, "step": 57, "train/ce_loss": 1.083419919013977 }, { "epoch": 0.005635752422384813, "step": 57, "train/sim_loss": 0.3125 }, { "epoch": 0.005635752422384813, "step": 57, "train/total_loss": 0.4208419919013977 }, { "entropy": 6.81294059753418, "epoch": 0.005734625271900337, "mean_token_accuracy": 0.7080131769180298, "num_tokens": 324685.0, "step": 58, "train/ce_loss": 1.0077672004699707 }, { "epoch": 0.005734625271900337, "step": 58, "train/sim_loss": 0.2265625 }, { "epoch": 0.005734625271900337, "step": 58, "train/total_loss": 0.327339231967926 }, { "entropy": 6.772646427154541, "epoch": 0.0058334981214158595, "mean_token_accuracy": 0.6804326176643372, "num_tokens": 330216.0, "step": 59, "train/ce_loss": 2.4057185649871826 }, { "epoch": 0.0058334981214158595, "step": 59, "train/sim_loss": 0.2265625 }, { "epoch": 0.0058334981214158595, "step": 59, "train/total_loss": 0.46713435649871826 }, { "epoch": 0.005932370970931382, "grad_norm": 1.039408564567566, "learning_rate": 9.987885081343026e-06, "loss": 0.3481, "step": 60 }, { "entropy": 6.743893146514893, "epoch": 0.005932370970931382, "mean_token_accuracy": 0.6648044586181641, "num_tokens": 335603.0, "step": 60, "train/ce_loss": 0.9384520053863525 }, { "epoch": 0.005932370970931382, "step": 60, "train/sim_loss": 0.21875 }, { "epoch": 0.005932370970931382, "step": 60, "train/total_loss": 0.3125951886177063 }, { "entropy": 6.990540504455566, "epoch": 0.006031243820446905, "mean_token_accuracy": 0.7692307829856873, "num_tokens": 341015.0, "step": 61, "train/ce_loss": 0.66419517993927 }, { "epoch": 0.006031243820446905, "step": 61, "train/sim_loss": 0.21875 }, { "epoch": 0.006031243820446905, "step": 61, "train/total_loss": 0.2851695120334625 }, { "entropy": 7.224564552307129, "epoch": 0.006130116669962428, "mean_token_accuracy": 0.757617712020874, "num_tokens": 346302.0, "step": 62, "train/ce_loss": 1.1322176456451416 }, { "epoch": 0.006130116669962428, "step": 62, "train/sim_loss": 0.21875 }, { "epoch": 0.006130116669962428, "step": 62, "train/total_loss": 0.33197176456451416 }, { "entropy": 7.201101779937744, "epoch": 0.006228989519477951, "mean_token_accuracy": 0.7360000014305115, "num_tokens": 351791.0, "step": 63, "train/ce_loss": 0.8078625798225403 }, { "epoch": 0.006228989519477951, "step": 63, "train/sim_loss": 0.27734375 }, { "epoch": 0.006228989519477951, "step": 63, "train/total_loss": 0.35813000798225403 }, { "entropy": 7.103139400482178, "epoch": 0.006327862368993475, "mean_token_accuracy": 0.7447852492332458, "num_tokens": 357275.0, "step": 64, "train/ce_loss": 0.8672134280204773 }, { "epoch": 0.006327862368993475, "step": 64, "train/sim_loss": 0.20703125 }, { "epoch": 0.006327862368993475, "step": 64, "train/total_loss": 0.29375261068344116 }, { "entropy": 6.742211818695068, "epoch": 0.006426735218508998, "mean_token_accuracy": 0.7355864644050598, "num_tokens": 362898.0, "step": 65, "train/ce_loss": 1.351665735244751 }, { "epoch": 0.006426735218508998, "step": 65, "train/sim_loss": 0.20703125 }, { "epoch": 0.006426735218508998, "step": 65, "train/total_loss": 0.34219783544540405 }, { "entropy": 6.806244850158691, "epoch": 0.0065256080680245205, "mean_token_accuracy": 0.7531707286834717, "num_tokens": 368551.0, "step": 66, "train/ce_loss": 0.9516059756278992 }, { "epoch": 0.0065256080680245205, "step": 66, "train/sim_loss": 0.21484375 }, { "epoch": 0.0065256080680245205, "step": 66, "train/total_loss": 0.3100043535232544 }, { "entropy": 6.553163051605225, "epoch": 0.006624480917540043, "mean_token_accuracy": 0.7030043005943298, "num_tokens": 374346.0, "step": 67, "train/ce_loss": 1.0622292757034302 }, { "epoch": 0.006624480917540043, "step": 67, "train/sim_loss": 0.20703125 }, { "epoch": 0.006624480917540043, "step": 67, "train/total_loss": 0.313254177570343 }, { "entropy": 7.30144739151001, "epoch": 0.006723353767055566, "mean_token_accuracy": 0.7493975758552551, "num_tokens": 379763.0, "step": 68, "train/ce_loss": 0.8851649761199951 }, { "epoch": 0.006723353767055566, "step": 68, "train/sim_loss": 0.19921875 }, { "epoch": 0.006723353767055566, "step": 68, "train/total_loss": 0.287735253572464 }, { "entropy": 6.755577087402344, "epoch": 0.00682222661657109, "mean_token_accuracy": 0.6817777752876282, "num_tokens": 385440.0, "step": 69, "train/ce_loss": 0.6391826868057251 }, { "epoch": 0.00682222661657109, "step": 69, "train/sim_loss": 0.26953125 }, { "epoch": 0.00682222661657109, "step": 69, "train/total_loss": 0.33344951272010803 }, { "entropy": 7.322375297546387, "epoch": 0.006921099466086613, "mean_token_accuracy": 0.728199303150177, "num_tokens": 390923.0, "step": 70, "train/ce_loss": 0.5619321465492249 }, { "epoch": 0.006921099466086613, "step": 70, "train/sim_loss": 0.20703125 }, { "epoch": 0.006921099466086613, "step": 70, "train/total_loss": 0.26322445273399353 }, { "entropy": 7.227473258972168, "epoch": 0.007019972315602136, "mean_token_accuracy": 0.683698296546936, "num_tokens": 396332.0, "step": 71, "train/ce_loss": 1.6033495664596558 }, { "epoch": 0.007019972315602136, "step": 71, "train/sim_loss": 0.328125 }, { "epoch": 0.007019972315602136, "step": 71, "train/total_loss": 0.4884599447250366 }, { "entropy": 6.997678756713867, "epoch": 0.007118845165117659, "mean_token_accuracy": 0.7644927501678467, "num_tokens": 401775.0, "step": 72, "train/ce_loss": 0.655776858329773 }, { "epoch": 0.007118845165117659, "step": 72, "train/sim_loss": 0.19921875 }, { "epoch": 0.007118845165117659, "step": 72, "train/total_loss": 0.2647964358329773 }, { "entropy": 7.407192230224609, "epoch": 0.0072177180146331815, "mean_token_accuracy": 0.7244284152984619, "num_tokens": 407185.0, "step": 73, "train/ce_loss": 0.8598359823226929 }, { "epoch": 0.0072177180146331815, "step": 73, "train/sim_loss": 0.25 }, { "epoch": 0.0072177180146331815, "step": 73, "train/total_loss": 0.33598360419273376 }, { "entropy": 7.304167747497559, "epoch": 0.007316590864148704, "mean_token_accuracy": 0.7016706466674805, "num_tokens": 412643.0, "step": 74, "train/ce_loss": 1.223954439163208 }, { "epoch": 0.007316590864148704, "step": 74, "train/sim_loss": 0.3046875 }, { "epoch": 0.007316590864148704, "step": 74, "train/total_loss": 0.42708295583724976 }, { "entropy": 7.513787746429443, "epoch": 0.007415463713664228, "mean_token_accuracy": 0.7132169604301453, "num_tokens": 418078.0, "step": 75, "train/ce_loss": 1.1734284162521362 }, { "epoch": 0.007415463713664228, "step": 75, "train/sim_loss": 0.23046875 }, { "epoch": 0.007415463713664228, "step": 75, "train/total_loss": 0.34781157970428467 }, { "entropy": 7.257331371307373, "epoch": 0.007514336563179751, "mean_token_accuracy": 0.7441217303276062, "num_tokens": 423413.0, "step": 76, "train/ce_loss": 0.8698193430900574 }, { "epoch": 0.007514336563179751, "step": 76, "train/sim_loss": 0.1796875 }, { "epoch": 0.007514336563179751, "step": 76, "train/total_loss": 0.26666945219039917 }, { "entropy": 6.927927494049072, "epoch": 0.007613209412695274, "mean_token_accuracy": 0.7286282181739807, "num_tokens": 428981.0, "step": 77, "train/ce_loss": 0.837859570980072 }, { "epoch": 0.007613209412695274, "step": 77, "train/sim_loss": 0.1875 }, { "epoch": 0.007613209412695274, "step": 77, "train/total_loss": 0.2712859511375427 }, { "entropy": 7.4518818855285645, "epoch": 0.007712082262210797, "mean_token_accuracy": 0.7357991933822632, "num_tokens": 434277.0, "step": 78, "train/ce_loss": 0.8753558993339539 }, { "epoch": 0.007712082262210797, "step": 78, "train/sim_loss": 0.1875 }, { "epoch": 0.007712082262210797, "step": 78, "train/total_loss": 0.2750355899333954 }, { "entropy": 7.132779121398926, "epoch": 0.00781095511172632, "mean_token_accuracy": 0.7445972561836243, "num_tokens": 440005.0, "step": 79, "train/ce_loss": 0.8937827348709106 }, { "epoch": 0.00781095511172632, "step": 79, "train/sim_loss": 0.2109375 }, { "epoch": 0.00781095511172632, "step": 79, "train/total_loss": 0.3003157675266266 }, { "epoch": 0.007909827961241843, "grad_norm": 1.0260167121887207, "learning_rate": 9.982940216585078e-06, "loss": 0.3181, "step": 80 }, { "entropy": 6.950823783874512, "epoch": 0.007909827961241843, "mean_token_accuracy": 0.757717490196228, "num_tokens": 445714.0, "step": 80, "train/ce_loss": 0.7734050750732422 }, { "epoch": 0.007909827961241843, "step": 80, "train/sim_loss": 0.28125 }, { "epoch": 0.007909827961241843, "step": 80, "train/total_loss": 0.3585905134677887 }, { "entropy": 6.9274983406066895, "epoch": 0.008008700810757366, "mean_token_accuracy": 0.8101983070373535, "num_tokens": 451448.0, "step": 81, "train/ce_loss": 0.6553278565406799 }, { "epoch": 0.008008700810757366, "step": 81, "train/sim_loss": 0.1953125 }, { "epoch": 0.008008700810757366, "step": 81, "train/total_loss": 0.2608453035354614 }, { "entropy": 7.062063217163086, "epoch": 0.008107573660272888, "mean_token_accuracy": 0.8045186400413513, "num_tokens": 457312.0, "step": 82, "train/ce_loss": 0.615662693977356 }, { "epoch": 0.008107573660272888, "step": 82, "train/sim_loss": 0.1796875 }, { "epoch": 0.008107573660272888, "step": 82, "train/total_loss": 0.24125376343727112 }, { "entropy": 7.133183479309082, "epoch": 0.008206446509788412, "mean_token_accuracy": 0.771458089351654, "num_tokens": 462791.0, "step": 83, "train/ce_loss": 0.5283121466636658 }, { "epoch": 0.008206446509788412, "step": 83, "train/sim_loss": 0.18359375 }, { "epoch": 0.008206446509788412, "step": 83, "train/total_loss": 0.23642496764659882 }, { "entropy": 7.181549072265625, "epoch": 0.008305319359303936, "mean_token_accuracy": 0.7384066581726074, "num_tokens": 468289.0, "step": 84, "train/ce_loss": 0.7692922353744507 }, { "epoch": 0.008305319359303936, "step": 84, "train/sim_loss": 0.1484375 }, { "epoch": 0.008305319359303936, "step": 84, "train/total_loss": 0.2253667265176773 }, { "entropy": 6.921457290649414, "epoch": 0.008404192208819458, "mean_token_accuracy": 0.6997725367546082, "num_tokens": 474188.0, "step": 85, "train/ce_loss": 0.4455355405807495 }, { "epoch": 0.008404192208819458, "step": 85, "train/sim_loss": 0.13671875 }, { "epoch": 0.008404192208819458, "step": 85, "train/total_loss": 0.18127229809761047 }, { "entropy": 7.701231002807617, "epoch": 0.008503065058334982, "mean_token_accuracy": 0.712383508682251, "num_tokens": 479495.0, "step": 86, "train/ce_loss": 0.738291323184967 }, { "epoch": 0.008503065058334982, "step": 86, "train/sim_loss": 0.1796875 }, { "epoch": 0.008503065058334982, "step": 86, "train/total_loss": 0.25351664423942566 }, { "entropy": 7.724763870239258, "epoch": 0.008601937907850504, "mean_token_accuracy": 0.7400000095367432, "num_tokens": 485128.0, "step": 87, "train/ce_loss": 0.853188157081604 }, { "epoch": 0.008601937907850504, "step": 87, "train/sim_loss": 0.26171875 }, { "epoch": 0.008601937907850504, "step": 87, "train/total_loss": 0.34703755378723145 }, { "entropy": 7.717776298522949, "epoch": 0.008700810757366027, "mean_token_accuracy": 0.7341772317886353, "num_tokens": 490530.0, "step": 88, "train/ce_loss": 1.1054401397705078 }, { "epoch": 0.008700810757366027, "step": 88, "train/sim_loss": 0.1953125 }, { "epoch": 0.008700810757366027, "step": 88, "train/total_loss": 0.30585652589797974 }, { "entropy": 7.14111852645874, "epoch": 0.008799683606881551, "mean_token_accuracy": 0.7290192246437073, "num_tokens": 496098.0, "step": 89, "train/ce_loss": 0.6342229843139648 }, { "epoch": 0.008799683606881551, "step": 89, "train/sim_loss": 0.10546875 }, { "epoch": 0.008799683606881551, "step": 89, "train/total_loss": 0.168891042470932 }, { "entropy": 7.195494174957275, "epoch": 0.008898556456397073, "mean_token_accuracy": 0.6783284544944763, "num_tokens": 501653.0, "step": 90, "train/ce_loss": 0.700624942779541 }, { "epoch": 0.008898556456397073, "step": 90, "train/sim_loss": 0.2109375 }, { "epoch": 0.008898556456397073, "step": 90, "train/total_loss": 0.2809999883174896 }, { "entropy": 7.5853776931762695, "epoch": 0.008997429305912597, "mean_token_accuracy": 0.7553058862686157, "num_tokens": 507083.0, "step": 91, "train/ce_loss": 0.706805408000946 }, { "epoch": 0.008997429305912597, "step": 91, "train/sim_loss": 0.12890625 }, { "epoch": 0.008997429305912597, "step": 91, "train/total_loss": 0.19958679378032684 }, { "entropy": 7.3139448165893555, "epoch": 0.009096302155428119, "mean_token_accuracy": 0.7061556577682495, "num_tokens": 512561.0, "step": 92, "train/ce_loss": 0.8559697866439819 }, { "epoch": 0.009096302155428119, "step": 92, "train/sim_loss": 0.23828125 }, { "epoch": 0.009096302155428119, "step": 92, "train/total_loss": 0.3238782286643982 }, { "entropy": 7.599851608276367, "epoch": 0.009195175004943643, "mean_token_accuracy": 0.7286549806594849, "num_tokens": 517967.0, "step": 93, "train/ce_loss": 0.5614654421806335 }, { "epoch": 0.009195175004943643, "step": 93, "train/sim_loss": 0.1484375 }, { "epoch": 0.009195175004943643, "step": 93, "train/total_loss": 0.2045840471982956 }, { "entropy": 7.700191497802734, "epoch": 0.009294047854459166, "mean_token_accuracy": 0.7384615540504456, "num_tokens": 523392.0, "step": 94, "train/ce_loss": 0.8384101986885071 }, { "epoch": 0.009294047854459166, "step": 94, "train/sim_loss": 0.18359375 }, { "epoch": 0.009294047854459166, "step": 94, "train/total_loss": 0.2674347758293152 }, { "entropy": 7.96530294418335, "epoch": 0.009392920703974688, "mean_token_accuracy": 0.7230559587478638, "num_tokens": 528722.0, "step": 95, "train/ce_loss": 0.839792013168335 }, { "epoch": 0.009392920703974688, "step": 95, "train/sim_loss": 0.17578125 }, { "epoch": 0.009392920703974688, "step": 95, "train/total_loss": 0.25976043939590454 }, { "entropy": 7.8956403732299805, "epoch": 0.009491793553490212, "mean_token_accuracy": 0.7346024513244629, "num_tokens": 534184.0, "step": 96, "train/ce_loss": 0.4742581248283386 }, { "epoch": 0.009491793553490212, "step": 96, "train/sim_loss": 0.17578125 }, { "epoch": 0.009491793553490212, "step": 96, "train/total_loss": 0.22320705652236938 }, { "entropy": 7.394715309143066, "epoch": 0.009590666403005734, "mean_token_accuracy": 0.769911527633667, "num_tokens": 539772.0, "step": 97, "train/ce_loss": 0.7879075407981873 }, { "epoch": 0.009590666403005734, "step": 97, "train/sim_loss": 0.24609375 }, { "epoch": 0.009590666403005734, "step": 97, "train/total_loss": 0.3248845040798187 }, { "entropy": 7.417305946350098, "epoch": 0.009689539252521258, "mean_token_accuracy": 0.724952757358551, "num_tokens": 545525.0, "step": 98, "train/ce_loss": 1.0810619592666626 }, { "epoch": 0.009689539252521258, "step": 98, "train/sim_loss": 0.2109375 }, { "epoch": 0.009689539252521258, "step": 98, "train/total_loss": 0.31904369592666626 }, { "entropy": 8.026399612426758, "epoch": 0.00978841210203678, "mean_token_accuracy": 0.7572115659713745, "num_tokens": 550948.0, "step": 99, "train/ce_loss": 0.8506556153297424 }, { "epoch": 0.00978841210203678, "step": 99, "train/sim_loss": 0.203125 }, { "epoch": 0.00978841210203678, "step": 99, "train/total_loss": 0.2881905734539032 }, { "epoch": 0.009887284951552304, "grad_norm": 0.9516990780830383, "learning_rate": 9.977995351827128e-06, "loss": 0.2863, "step": 100 }, { "entropy": 8.08210563659668, "epoch": 0.009887284951552304, "mean_token_accuracy": 0.693989098072052, "num_tokens": 556337.0, "step": 100, "train/ce_loss": 1.3508840799331665 }, { "epoch": 0.009887284951552304, "step": 100, "train/sim_loss": 0.17578125 }, { "epoch": 0.009887284951552304, "step": 100, "train/total_loss": 0.31086966395378113 }, { "entropy": 7.893654823303223, "epoch": 0.009986157801067827, "mean_token_accuracy": 0.7214912176132202, "num_tokens": 561836.0, "step": 101, "train/ce_loss": 0.500427782535553 }, { "epoch": 0.009986157801067827, "step": 101, "train/sim_loss": 0.1640625 }, { "epoch": 0.009986157801067827, "step": 101, "train/total_loss": 0.2141052782535553 }, { "entropy": 8.135427474975586, "epoch": 0.01008503065058335, "mean_token_accuracy": 0.7607594728469849, "num_tokens": 567227.0, "step": 102, "train/ce_loss": 0.6604991555213928 }, { "epoch": 0.01008503065058335, "step": 102, "train/sim_loss": 0.18359375 }, { "epoch": 0.01008503065058335, "step": 102, "train/total_loss": 0.24964366853237152 }, { "entropy": 7.778552055358887, "epoch": 0.010183903500098873, "mean_token_accuracy": 0.7262905240058899, "num_tokens": 572649.0, "step": 103, "train/ce_loss": 1.3892452716827393 }, { "epoch": 0.010183903500098873, "step": 103, "train/sim_loss": 0.22265625 }, { "epoch": 0.010183903500098873, "step": 103, "train/total_loss": 0.3615807890892029 }, { "entropy": 7.775136947631836, "epoch": 0.010282776349614395, "mean_token_accuracy": 0.720168948173523, "num_tokens": 578161.0, "step": 104, "train/ce_loss": 1.0051565170288086 }, { "epoch": 0.010282776349614395, "step": 104, "train/sim_loss": 0.13671875 }, { "epoch": 0.010282776349614395, "step": 104, "train/total_loss": 0.23723441362380981 }, { "entropy": 7.681502342224121, "epoch": 0.010381649199129919, "mean_token_accuracy": 0.7805714011192322, "num_tokens": 583675.0, "step": 105, "train/ce_loss": 0.9624525904655457 }, { "epoch": 0.010381649199129919, "step": 105, "train/sim_loss": 0.1796875 }, { "epoch": 0.010381649199129919, "step": 105, "train/total_loss": 0.27593275904655457 }, { "entropy": 7.381743907928467, "epoch": 0.010480522048645443, "mean_token_accuracy": 0.7068965435028076, "num_tokens": 589240.0, "step": 106, "train/ce_loss": 0.6358188986778259 }, { "epoch": 0.010480522048645443, "step": 106, "train/sim_loss": 0.1640625 }, { "epoch": 0.010480522048645443, "step": 106, "train/total_loss": 0.22764438390731812 }, { "entropy": 7.868211269378662, "epoch": 0.010579394898160965, "mean_token_accuracy": 0.6998706459999084, "num_tokens": 594633.0, "step": 107, "train/ce_loss": 1.1002508401870728 }, { "epoch": 0.010579394898160965, "step": 107, "train/sim_loss": 0.15625 }, { "epoch": 0.010579394898160965, "step": 107, "train/total_loss": 0.2662750780582428 }, { "entropy": 7.6260223388671875, "epoch": 0.010678267747676488, "mean_token_accuracy": 0.7510460019111633, "num_tokens": 600263.0, "step": 108, "train/ce_loss": 0.7622029185295105 }, { "epoch": 0.010678267747676488, "step": 108, "train/sim_loss": 0.109375 }, { "epoch": 0.010678267747676488, "step": 108, "train/total_loss": 0.18559530377388 }, { "entropy": 8.227834701538086, "epoch": 0.01077714059719201, "mean_token_accuracy": 0.7171717286109924, "num_tokens": 605556.0, "step": 109, "train/ce_loss": 0.905960738658905 }, { "epoch": 0.01077714059719201, "step": 109, "train/sim_loss": 0.1875 }, { "epoch": 0.01077714059719201, "step": 109, "train/total_loss": 0.278096079826355 }, { "entropy": 8.20416259765625, "epoch": 0.010876013446707534, "mean_token_accuracy": 0.7243995070457458, "num_tokens": 610953.0, "step": 110, "train/ce_loss": 1.3408927917480469 }, { "epoch": 0.010876013446707534, "step": 110, "train/sim_loss": 0.23828125 }, { "epoch": 0.010876013446707534, "step": 110, "train/total_loss": 0.37237054109573364 }, { "entropy": 8.091245651245117, "epoch": 0.010974886296223058, "mean_token_accuracy": 0.7617371082305908, "num_tokens": 616455.0, "step": 111, "train/ce_loss": 1.06613290309906 }, { "epoch": 0.010974886296223058, "step": 111, "train/sim_loss": 0.2109375 }, { "epoch": 0.010974886296223058, "step": 111, "train/total_loss": 0.31755077838897705 }, { "entropy": 7.71589994430542, "epoch": 0.01107375914573858, "mean_token_accuracy": 0.7695390582084656, "num_tokens": 622010.0, "step": 112, "train/ce_loss": 0.9249752759933472 }, { "epoch": 0.01107375914573858, "step": 112, "train/sim_loss": 0.11328125 }, { "epoch": 0.01107375914573858, "step": 112, "train/total_loss": 0.20577877759933472 }, { "entropy": 8.15939712524414, "epoch": 0.011172631995254104, "mean_token_accuracy": 0.7125890851020813, "num_tokens": 627429.0, "step": 113, "train/ce_loss": 0.7607758045196533 }, { "epoch": 0.011172631995254104, "step": 113, "train/sim_loss": 0.16796875 }, { "epoch": 0.011172631995254104, "step": 113, "train/total_loss": 0.24404633045196533 }, { "entropy": 7.443978309631348, "epoch": 0.011271504844769626, "mean_token_accuracy": 0.7631862163543701, "num_tokens": 633013.0, "step": 114, "train/ce_loss": 0.8538786768913269 }, { "epoch": 0.011271504844769626, "step": 114, "train/sim_loss": 0.1796875 }, { "epoch": 0.011271504844769626, "step": 114, "train/total_loss": 0.2650753855705261 }, { "entropy": 8.388126373291016, "epoch": 0.01137037769428515, "mean_token_accuracy": 0.703377366065979, "num_tokens": 638437.0, "step": 115, "train/ce_loss": 0.7227177619934082 }, { "epoch": 0.01137037769428515, "step": 115, "train/sim_loss": 0.14453125 }, { "epoch": 0.01137037769428515, "step": 115, "train/total_loss": 0.21680302917957306 }, { "entropy": 8.157764434814453, "epoch": 0.011469250543800673, "mean_token_accuracy": 0.6994885206222534, "num_tokens": 643859.0, "step": 116, "train/ce_loss": 1.3380248546600342 }, { "epoch": 0.011469250543800673, "step": 116, "train/sim_loss": 0.265625 }, { "epoch": 0.011469250543800673, "step": 116, "train/total_loss": 0.39942747354507446 }, { "entropy": 7.718086242675781, "epoch": 0.011568123393316195, "mean_token_accuracy": 0.7408105731010437, "num_tokens": 649470.0, "step": 117, "train/ce_loss": 0.9935168027877808 }, { "epoch": 0.011568123393316195, "step": 117, "train/sim_loss": 0.1484375 }, { "epoch": 0.011568123393316195, "step": 117, "train/total_loss": 0.2477891743183136 }, { "entropy": 8.220015525817871, "epoch": 0.011666996242831719, "mean_token_accuracy": 0.7015834450721741, "num_tokens": 654904.0, "step": 118, "train/ce_loss": 0.7607983350753784 }, { "epoch": 0.011666996242831719, "step": 118, "train/sim_loss": 0.16796875 }, { "epoch": 0.011666996242831719, "step": 118, "train/total_loss": 0.2440485954284668 }, { "entropy": 7.8679914474487305, "epoch": 0.011765869092347241, "mean_token_accuracy": 0.7121374607086182, "num_tokens": 660491.0, "step": 119, "train/ce_loss": 0.7249107956886292 }, { "epoch": 0.011765869092347241, "step": 119, "train/sim_loss": 0.15234375 }, { "epoch": 0.011765869092347241, "step": 119, "train/total_loss": 0.22483482956886292 }, { "epoch": 0.011864741941862765, "grad_norm": 0.8319772481918335, "learning_rate": 9.973050487069179e-06, "loss": 0.2758, "step": 120 }, { "entropy": 7.699333667755127, "epoch": 0.011864741941862765, "mean_token_accuracy": 0.719235360622406, "num_tokens": 665965.0, "step": 120, "train/ce_loss": 0.5454022288322449 }, { "epoch": 0.011864741941862765, "step": 120, "train/sim_loss": 0.1875 }, { "epoch": 0.011864741941862765, "step": 120, "train/total_loss": 0.24204021692276 }, { "entropy": 7.9337477684021, "epoch": 0.011963614791378287, "mean_token_accuracy": 0.7081151604652405, "num_tokens": 671319.0, "step": 121, "train/ce_loss": 1.4078081846237183 }, { "epoch": 0.011963614791378287, "step": 121, "train/sim_loss": 0.359375 }, { "epoch": 0.011963614791378287, "step": 121, "train/total_loss": 0.5001558065414429 }, { "entropy": 8.065155029296875, "epoch": 0.01206248764089381, "mean_token_accuracy": 0.7171464562416077, "num_tokens": 676770.0, "step": 122, "train/ce_loss": 0.7923189997673035 }, { "epoch": 0.01206248764089381, "step": 122, "train/sim_loss": 0.15234375 }, { "epoch": 0.01206248764089381, "step": 122, "train/total_loss": 0.23157565295696259 }, { "entropy": 8.039446830749512, "epoch": 0.012161360490409334, "mean_token_accuracy": 0.7864450216293335, "num_tokens": 682113.0, "step": 123, "train/ce_loss": 0.8386374115943909 }, { "epoch": 0.012161360490409334, "step": 123, "train/sim_loss": 0.15625 }, { "epoch": 0.012161360490409334, "step": 123, "train/total_loss": 0.2401137351989746 }, { "entropy": 8.231321334838867, "epoch": 0.012260233339924856, "mean_token_accuracy": 0.6794871687889099, "num_tokens": 687721.0, "step": 124, "train/ce_loss": 0.6296879053115845 }, { "epoch": 0.012260233339924856, "step": 124, "train/sim_loss": 0.19921875 }, { "epoch": 0.012260233339924856, "step": 124, "train/total_loss": 0.26218754053115845 }, { "entropy": 8.031255722045898, "epoch": 0.01235910618944038, "mean_token_accuracy": 0.784795343875885, "num_tokens": 693189.0, "step": 125, "train/ce_loss": 0.6077536344528198 }, { "epoch": 0.01235910618944038, "step": 125, "train/sim_loss": 0.2421875 }, { "epoch": 0.01235910618944038, "step": 125, "train/total_loss": 0.30296286940574646 }, { "entropy": 7.960617542266846, "epoch": 0.012457979038955902, "mean_token_accuracy": 0.7593896985054016, "num_tokens": 698677.0, "step": 126, "train/ce_loss": 1.0030608177185059 }, { "epoch": 0.012457979038955902, "step": 126, "train/sim_loss": 0.234375 }, { "epoch": 0.012457979038955902, "step": 126, "train/total_loss": 0.33468109369277954 }, { "entropy": 8.2476167678833, "epoch": 0.012556851888471426, "mean_token_accuracy": 0.7514451146125793, "num_tokens": 703968.0, "step": 127, "train/ce_loss": 0.6719730496406555 }, { "epoch": 0.012556851888471426, "step": 127, "train/sim_loss": 0.2109375 }, { "epoch": 0.012556851888471426, "step": 127, "train/total_loss": 0.278134822845459 }, { "entropy": 8.047690391540527, "epoch": 0.01265572473798695, "mean_token_accuracy": 0.7151819467544556, "num_tokens": 709333.0, "step": 128, "train/ce_loss": 0.8312079310417175 }, { "epoch": 0.01265572473798695, "step": 128, "train/sim_loss": 0.22265625 }, { "epoch": 0.01265572473798695, "step": 128, "train/total_loss": 0.30577704310417175 }, { "entropy": 7.591558933258057, "epoch": 0.012754597587502471, "mean_token_accuracy": 0.6928508281707764, "num_tokens": 715022.0, "step": 129, "train/ce_loss": 1.1940933465957642 }, { "epoch": 0.012754597587502471, "step": 129, "train/sim_loss": 0.20703125 }, { "epoch": 0.012754597587502471, "step": 129, "train/total_loss": 0.32644057273864746 }, { "entropy": 8.406843185424805, "epoch": 0.012853470437017995, "mean_token_accuracy": 0.7337031960487366, "num_tokens": 720288.0, "step": 130, "train/ce_loss": 0.9681195020675659 }, { "epoch": 0.012853470437017995, "step": 130, "train/sim_loss": 0.109375 }, { "epoch": 0.012853470437017995, "step": 130, "train/total_loss": 0.2061869502067566 }, { "entropy": 7.980602741241455, "epoch": 0.012952343286533517, "mean_token_accuracy": 0.7446327805519104, "num_tokens": 725826.0, "step": 131, "train/ce_loss": 1.047386646270752 }, { "epoch": 0.012952343286533517, "step": 131, "train/sim_loss": 0.09375 }, { "epoch": 0.012952343286533517, "step": 131, "train/total_loss": 0.19848866760730743 }, { "entropy": 8.287382125854492, "epoch": 0.013051216136049041, "mean_token_accuracy": 0.6990172266960144, "num_tokens": 731221.0, "step": 132, "train/ce_loss": 1.5049716234207153 }, { "epoch": 0.013051216136049041, "step": 132, "train/sim_loss": 0.26171875 }, { "epoch": 0.013051216136049041, "step": 132, "train/total_loss": 0.412215918302536 }, { "entropy": 8.240259170532227, "epoch": 0.013150088985564565, "mean_token_accuracy": 0.7431421279907227, "num_tokens": 736633.0, "step": 133, "train/ce_loss": 0.9649224281311035 }, { "epoch": 0.013150088985564565, "step": 133, "train/sim_loss": 0.24609375 }, { "epoch": 0.013150088985564565, "step": 133, "train/total_loss": 0.3425859808921814 }, { "entropy": 8.168668746948242, "epoch": 0.013248961835080087, "mean_token_accuracy": 0.7428896427154541, "num_tokens": 742154.0, "step": 134, "train/ce_loss": 0.6955239176750183 }, { "epoch": 0.013248961835080087, "step": 134, "train/sim_loss": 0.15625 }, { "epoch": 0.013248961835080087, "step": 134, "train/total_loss": 0.22580239176750183 }, { "entropy": 8.09598445892334, "epoch": 0.01334783468459561, "mean_token_accuracy": 0.6994219422340393, "num_tokens": 747618.0, "step": 135, "train/ce_loss": 1.403621792793274 }, { "epoch": 0.01334783468459561, "step": 135, "train/sim_loss": 0.23046875 }, { "epoch": 0.01334783468459561, "step": 135, "train/total_loss": 0.3708309531211853 }, { "entropy": 8.16989803314209, "epoch": 0.013446707534111133, "mean_token_accuracy": 0.7592191100120544, "num_tokens": 753173.0, "step": 136, "train/ce_loss": 1.0911941528320312 }, { "epoch": 0.013446707534111133, "step": 136, "train/sim_loss": 0.171875 }, { "epoch": 0.013446707534111133, "step": 136, "train/total_loss": 0.2809944152832031 }, { "entropy": 8.079985618591309, "epoch": 0.013545580383626656, "mean_token_accuracy": 0.723696231842041, "num_tokens": 758850.0, "step": 137, "train/ce_loss": 1.841156244277954 }, { "epoch": 0.013545580383626656, "step": 137, "train/sim_loss": 0.14453125 }, { "epoch": 0.013545580383626656, "step": 137, "train/total_loss": 0.3286468982696533 }, { "entropy": 7.97689151763916, "epoch": 0.01364445323314218, "mean_token_accuracy": 0.736637532711029, "num_tokens": 764589.0, "step": 138, "train/ce_loss": 0.7574009895324707 }, { "epoch": 0.01364445323314218, "step": 138, "train/sim_loss": 0.20703125 }, { "epoch": 0.01364445323314218, "step": 138, "train/total_loss": 0.28277134895324707 }, { "entropy": 8.219090461730957, "epoch": 0.013743326082657702, "mean_token_accuracy": 0.7156069278717041, "num_tokens": 770009.0, "step": 139, "train/ce_loss": 0.754828929901123 }, { "epoch": 0.013743326082657702, "step": 139, "train/sim_loss": 0.11328125 }, { "epoch": 0.013743326082657702, "step": 139, "train/total_loss": 0.18876415491104126 }, { "epoch": 0.013842198932173226, "grad_norm": 0.8286811113357544, "learning_rate": 9.968105622311231e-06, "loss": 0.2758, "step": 140 }, { "entropy": 8.636162757873535, "epoch": 0.013842198932173226, "mean_token_accuracy": 0.7294617295265198, "num_tokens": 775364.0, "step": 140, "train/ce_loss": 1.1851195096969604 }, { "epoch": 0.013842198932173226, "step": 140, "train/sim_loss": 0.18359375 }, { "epoch": 0.013842198932173226, "step": 140, "train/total_loss": 0.30210569500923157 }, { "entropy": 8.151058197021484, "epoch": 0.013941071781688748, "mean_token_accuracy": 0.7199519276618958, "num_tokens": 780796.0, "step": 141, "train/ce_loss": 1.1012438535690308 }, { "epoch": 0.013941071781688748, "step": 141, "train/sim_loss": 0.140625 }, { "epoch": 0.013941071781688748, "step": 141, "train/total_loss": 0.2507493793964386 }, { "entropy": 8.496475219726562, "epoch": 0.014039944631204272, "mean_token_accuracy": 0.7623049020767212, "num_tokens": 786220.0, "step": 142, "train/ce_loss": 0.9634978771209717 }, { "epoch": 0.014039944631204272, "step": 142, "train/sim_loss": 0.22265625 }, { "epoch": 0.014039944631204272, "step": 142, "train/total_loss": 0.3190060257911682 }, { "entropy": 8.574563980102539, "epoch": 0.014138817480719794, "mean_token_accuracy": 0.7102425694465637, "num_tokens": 791496.0, "step": 143, "train/ce_loss": 0.7832018136978149 }, { "epoch": 0.014138817480719794, "step": 143, "train/sim_loss": 0.08984375 }, { "epoch": 0.014138817480719794, "step": 143, "train/total_loss": 0.16816392540931702 }, { "entropy": 7.899078845977783, "epoch": 0.014237690330235317, "mean_token_accuracy": 0.7223628759384155, "num_tokens": 797269.0, "step": 144, "train/ce_loss": 0.5067986249923706 }, { "epoch": 0.014237690330235317, "step": 144, "train/sim_loss": 0.2109375 }, { "epoch": 0.014237690330235317, "step": 144, "train/total_loss": 0.26161736249923706 }, { "entropy": 8.215500831604004, "epoch": 0.014336563179750841, "mean_token_accuracy": 0.7281659245491028, "num_tokens": 802818.0, "step": 145, "train/ce_loss": 0.9188609719276428 }, { "epoch": 0.014336563179750841, "step": 145, "train/sim_loss": 0.23828125 }, { "epoch": 0.014336563179750841, "step": 145, "train/total_loss": 0.33016735315322876 }, { "entropy": 8.074363708496094, "epoch": 0.014435436029266363, "mean_token_accuracy": 0.7164705991744995, "num_tokens": 808307.0, "step": 146, "train/ce_loss": 0.7893985509872437 }, { "epoch": 0.014435436029266363, "step": 146, "train/sim_loss": 0.1796875 }, { "epoch": 0.014435436029266363, "step": 146, "train/total_loss": 0.25862735509872437 }, { "entropy": 8.229816436767578, "epoch": 0.014534308878781887, "mean_token_accuracy": 0.7182940244674683, "num_tokens": 813832.0, "step": 147, "train/ce_loss": 0.924876868724823 }, { "epoch": 0.014534308878781887, "step": 147, "train/sim_loss": 0.203125 }, { "epoch": 0.014534308878781887, "step": 147, "train/total_loss": 0.2956126928329468 }, { "entropy": 8.376535415649414, "epoch": 0.014633181728297409, "mean_token_accuracy": 0.721455454826355, "num_tokens": 819267.0, "step": 148, "train/ce_loss": 1.0831589698791504 }, { "epoch": 0.014633181728297409, "step": 148, "train/sim_loss": 0.21484375 }, { "epoch": 0.014633181728297409, "step": 148, "train/total_loss": 0.3231596350669861 }, { "entropy": 8.5894193649292, "epoch": 0.014732054577812933, "mean_token_accuracy": 0.7018633484840393, "num_tokens": 824621.0, "step": 149, "train/ce_loss": 0.9788232445716858 }, { "epoch": 0.014732054577812933, "step": 149, "train/sim_loss": 0.078125 }, { "epoch": 0.014732054577812933, "step": 149, "train/total_loss": 0.17600733041763306 }, { "entropy": 8.585893630981445, "epoch": 0.014830927427328456, "mean_token_accuracy": 0.731836199760437, "num_tokens": 829996.0, "step": 150, "train/ce_loss": 0.5619584918022156 }, { "epoch": 0.014830927427328456, "step": 150, "train/sim_loss": 0.1796875 }, { "epoch": 0.014830927427328456, "step": 150, "train/total_loss": 0.23588335514068604 }, { "entropy": 8.44482421875, "epoch": 0.014929800276843978, "mean_token_accuracy": 0.7113401889801025, "num_tokens": 835466.0, "step": 151, "train/ce_loss": 0.6860079169273376 }, { "epoch": 0.014929800276843978, "step": 151, "train/sim_loss": 0.12890625 }, { "epoch": 0.014929800276843978, "step": 151, "train/total_loss": 0.19750705361366272 }, { "entropy": 8.131193161010742, "epoch": 0.015028673126359502, "mean_token_accuracy": 0.7104557752609253, "num_tokens": 841178.0, "step": 152, "train/ce_loss": 0.7275317311286926 }, { "epoch": 0.015028673126359502, "step": 152, "train/sim_loss": 0.2109375 }, { "epoch": 0.015028673126359502, "step": 152, "train/total_loss": 0.2836906909942627 }, { "entropy": 8.213645935058594, "epoch": 0.015127545975875024, "mean_token_accuracy": 0.7119205594062805, "num_tokens": 846760.0, "step": 153, "train/ce_loss": 1.052865982055664 }, { "epoch": 0.015127545975875024, "step": 153, "train/sim_loss": 0.21484375 }, { "epoch": 0.015127545975875024, "step": 153, "train/total_loss": 0.3201303482055664 }, { "entropy": 8.2730712890625, "epoch": 0.015226418825390548, "mean_token_accuracy": 0.7265446186065674, "num_tokens": 852238.0, "step": 154, "train/ce_loss": 0.7438105940818787 }, { "epoch": 0.015226418825390548, "step": 154, "train/sim_loss": 0.1875 }, { "epoch": 0.015226418825390548, "step": 154, "train/total_loss": 0.2618810534477234 }, { "entropy": 8.228289604187012, "epoch": 0.015325291674906072, "mean_token_accuracy": 0.7530054450035095, "num_tokens": 857748.0, "step": 155, "train/ce_loss": 0.8237298727035522 }, { "epoch": 0.015325291674906072, "step": 155, "train/sim_loss": 0.2109375 }, { "epoch": 0.015325291674906072, "step": 155, "train/total_loss": 0.2933104932308197 }, { "entropy": 8.393631935119629, "epoch": 0.015424164524421594, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 863138.0, "step": 156, "train/ce_loss": 1.361872911453247 }, { "epoch": 0.015424164524421594, "step": 156, "train/sim_loss": 0.16796875 }, { "epoch": 0.015424164524421594, "step": 156, "train/total_loss": 0.3041560649871826 }, { "entropy": 8.427817344665527, "epoch": 0.015523037373937117, "mean_token_accuracy": 0.7146702408790588, "num_tokens": 868499.0, "step": 157, "train/ce_loss": 0.9314271807670593 }, { "epoch": 0.015523037373937117, "step": 157, "train/sim_loss": 0.12109375 }, { "epoch": 0.015523037373937117, "step": 157, "train/total_loss": 0.21423646807670593 }, { "entropy": 8.203285217285156, "epoch": 0.01562191022345264, "mean_token_accuracy": 0.7361809015274048, "num_tokens": 873895.0, "step": 158, "train/ce_loss": 1.3417840003967285 }, { "epoch": 0.01562191022345264, "step": 158, "train/sim_loss": 0.22265625 }, { "epoch": 0.01562191022345264, "step": 158, "train/total_loss": 0.35683465003967285 }, { "entropy": 8.180510520935059, "epoch": 0.01572078307296816, "mean_token_accuracy": 0.6904969215393066, "num_tokens": 879637.0, "step": 159, "train/ce_loss": 0.5038241744041443 }, { "epoch": 0.01572078307296816, "step": 159, "train/sim_loss": 0.13671875 }, { "epoch": 0.01572078307296816, "step": 159, "train/total_loss": 0.18710117042064667 }, { "epoch": 0.015819655922483685, "grad_norm": 0.9185183048248291, "learning_rate": 9.963160757553282e-06, "loss": 0.2709, "step": 160 }, { "entropy": 8.547701835632324, "epoch": 0.015819655922483685, "mean_token_accuracy": 0.7168596982955933, "num_tokens": 885099.0, "step": 160, "train/ce_loss": 0.37520235776901245 }, { "epoch": 0.015819655922483685, "step": 160, "train/sim_loss": 0.15234375 }, { "epoch": 0.015819655922483685, "step": 160, "train/total_loss": 0.18986397981643677 }, { "entropy": 7.887790203094482, "epoch": 0.01591852877199921, "mean_token_accuracy": 0.707539975643158, "num_tokens": 891013.0, "step": 161, "train/ce_loss": 0.8556743264198303 }, { "epoch": 0.01591852877199921, "step": 161, "train/sim_loss": 0.07421875 }, { "epoch": 0.01591852877199921, "step": 161, "train/total_loss": 0.159786194562912 }, { "entropy": 8.371549606323242, "epoch": 0.016017401621514733, "mean_token_accuracy": 0.7684674859046936, "num_tokens": 896492.0, "step": 162, "train/ce_loss": 0.687208890914917 }, { "epoch": 0.016017401621514733, "step": 162, "train/sim_loss": 0.1328125 }, { "epoch": 0.016017401621514733, "step": 162, "train/total_loss": 0.20153339207172394 }, { "entropy": 8.445982933044434, "epoch": 0.016116274471030256, "mean_token_accuracy": 0.7370370626449585, "num_tokens": 901896.0, "step": 163, "train/ce_loss": 1.0037447214126587 }, { "epoch": 0.016116274471030256, "step": 163, "train/sim_loss": 0.1484375 }, { "epoch": 0.016116274471030256, "step": 163, "train/total_loss": 0.2488119751214981 }, { "entropy": 8.313480377197266, "epoch": 0.016215147320545777, "mean_token_accuracy": 0.7587672472000122, "num_tokens": 907477.0, "step": 164, "train/ce_loss": 0.7500120401382446 }, { "epoch": 0.016215147320545777, "step": 164, "train/sim_loss": 0.17578125 }, { "epoch": 0.016215147320545777, "step": 164, "train/total_loss": 0.25078245997428894 }, { "entropy": 8.55392074584961, "epoch": 0.0163140201700613, "mean_token_accuracy": 0.660804033279419, "num_tokens": 912923.0, "step": 165, "train/ce_loss": 1.6969555616378784 }, { "epoch": 0.0163140201700613, "step": 165, "train/sim_loss": 0.23046875 }, { "epoch": 0.0163140201700613, "step": 165, "train/total_loss": 0.40016430616378784 }, { "entropy": 8.072317123413086, "epoch": 0.016412893019576824, "mean_token_accuracy": 0.6775209903717041, "num_tokens": 918556.0, "step": 166, "train/ce_loss": 1.4354581832885742 }, { "epoch": 0.016412893019576824, "step": 166, "train/sim_loss": 0.23828125 }, { "epoch": 0.016412893019576824, "step": 166, "train/total_loss": 0.38182705640792847 }, { "entropy": 8.796947479248047, "epoch": 0.016511765869092348, "mean_token_accuracy": 0.7236503958702087, "num_tokens": 923879.0, "step": 167, "train/ce_loss": 0.8442161679267883 }, { "epoch": 0.016511765869092348, "step": 167, "train/sim_loss": 0.12890625 }, { "epoch": 0.016511765869092348, "step": 167, "train/total_loss": 0.21332786977291107 }, { "entropy": 8.566291809082031, "epoch": 0.01661063871860787, "mean_token_accuracy": 0.7338792085647583, "num_tokens": 929781.0, "step": 168, "train/ce_loss": 0.9451162219047546 }, { "epoch": 0.01661063871860787, "step": 168, "train/sim_loss": 0.16796875 }, { "epoch": 0.01661063871860787, "step": 168, "train/total_loss": 0.26248037815093994 }, { "entropy": 8.302209854125977, "epoch": 0.016709511568123392, "mean_token_accuracy": 0.7618069648742676, "num_tokens": 935293.0, "step": 169, "train/ce_loss": 0.7426667213439941 }, { "epoch": 0.016709511568123392, "step": 169, "train/sim_loss": 0.21875 }, { "epoch": 0.016709511568123392, "step": 169, "train/total_loss": 0.2930166721343994 }, { "entropy": 8.378729820251465, "epoch": 0.016808384417638916, "mean_token_accuracy": 0.7412177920341492, "num_tokens": 940737.0, "step": 170, "train/ce_loss": 0.9783827662467957 }, { "epoch": 0.016808384417638916, "step": 170, "train/sim_loss": 0.1953125 }, { "epoch": 0.016808384417638916, "step": 170, "train/total_loss": 0.29315078258514404 }, { "entropy": 8.711372375488281, "epoch": 0.01690725726715444, "mean_token_accuracy": 0.7235293984413147, "num_tokens": 946032.0, "step": 171, "train/ce_loss": 0.7599998116493225 }, { "epoch": 0.01690725726715444, "step": 171, "train/sim_loss": 0.15234375 }, { "epoch": 0.01690725726715444, "step": 171, "train/total_loss": 0.22834372520446777 }, { "entropy": 8.618997573852539, "epoch": 0.017006130116669963, "mean_token_accuracy": 0.7113259434700012, "num_tokens": 951404.0, "step": 172, "train/ce_loss": 1.117348551750183 }, { "epoch": 0.017006130116669963, "step": 172, "train/sim_loss": 0.12890625 }, { "epoch": 0.017006130116669963, "step": 172, "train/total_loss": 0.24064111709594727 }, { "entropy": 8.312094688415527, "epoch": 0.017105002966185487, "mean_token_accuracy": 0.7028347849845886, "num_tokens": 957047.0, "step": 173, "train/ce_loss": 1.3785039186477661 }, { "epoch": 0.017105002966185487, "step": 173, "train/sim_loss": 0.2265625 }, { "epoch": 0.017105002966185487, "step": 173, "train/total_loss": 0.36441290378570557 }, { "entropy": 8.664949417114258, "epoch": 0.017203875815701007, "mean_token_accuracy": 0.7399165630340576, "num_tokens": 962336.0, "step": 174, "train/ce_loss": 0.9396028518676758 }, { "epoch": 0.017203875815701007, "step": 174, "train/sim_loss": 0.21484375 }, { "epoch": 0.017203875815701007, "step": 174, "train/total_loss": 0.3088040351867676 }, { "entropy": 8.358953475952148, "epoch": 0.01730274866521653, "mean_token_accuracy": 0.773797333240509, "num_tokens": 967964.0, "step": 175, "train/ce_loss": 0.6981449723243713 }, { "epoch": 0.01730274866521653, "step": 175, "train/sim_loss": 0.1171875 }, { "epoch": 0.01730274866521653, "step": 175, "train/total_loss": 0.1870020031929016 }, { "entropy": 8.358871459960938, "epoch": 0.017401621514732055, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 973533.0, "step": 176, "train/ce_loss": 0.7937830686569214 }, { "epoch": 0.017401621514732055, "step": 176, "train/sim_loss": 0.15625 }, { "epoch": 0.017401621514732055, "step": 176, "train/total_loss": 0.23562830686569214 }, { "entropy": 8.56052017211914, "epoch": 0.01750049436424758, "mean_token_accuracy": 0.729863703250885, "num_tokens": 978985.0, "step": 177, "train/ce_loss": 0.9147941470146179 }, { "epoch": 0.01750049436424758, "step": 177, "train/sim_loss": 0.11328125 }, { "epoch": 0.01750049436424758, "step": 177, "train/total_loss": 0.20476067066192627 }, { "entropy": 8.466004371643066, "epoch": 0.017599367213763102, "mean_token_accuracy": 0.745334804058075, "num_tokens": 984537.0, "step": 178, "train/ce_loss": 0.8773728013038635 }, { "epoch": 0.017599367213763102, "step": 178, "train/sim_loss": 0.1328125 }, { "epoch": 0.017599367213763102, "step": 178, "train/total_loss": 0.2205497920513153 }, { "entropy": 8.841814041137695, "epoch": 0.017698240063278622, "mean_token_accuracy": 0.7211837768554688, "num_tokens": 989796.0, "step": 179, "train/ce_loss": 0.6652518510818481 }, { "epoch": 0.017698240063278622, "step": 179, "train/sim_loss": 0.1484375 }, { "epoch": 0.017698240063278622, "step": 179, "train/total_loss": 0.2149626910686493 }, { "epoch": 0.017797112912794146, "grad_norm": 0.947462260723114, "learning_rate": 9.958215892795334e-06, "loss": 0.2562, "step": 180 }, { "entropy": 8.484806060791016, "epoch": 0.017797112912794146, "mean_token_accuracy": 0.7085650563240051, "num_tokens": 995287.0, "step": 180, "train/ce_loss": 1.0508121252059937 }, { "epoch": 0.017797112912794146, "step": 180, "train/sim_loss": 0.16796875 }, { "epoch": 0.017797112912794146, "step": 180, "train/total_loss": 0.2730499505996704 }, { "entropy": 8.663198471069336, "epoch": 0.01789598576230967, "mean_token_accuracy": 0.7325905561447144, "num_tokens": 1000663.0, "step": 181, "train/ce_loss": 0.8166200518608093 }, { "epoch": 0.01789598576230967, "step": 181, "train/sim_loss": 0.1484375 }, { "epoch": 0.01789598576230967, "step": 181, "train/total_loss": 0.23009949922561646 }, { "entropy": 8.165563583374023, "epoch": 0.017994858611825194, "mean_token_accuracy": 0.737313449382782, "num_tokens": 1006315.0, "step": 182, "train/ce_loss": 0.8787663578987122 }, { "epoch": 0.017994858611825194, "step": 182, "train/sim_loss": 0.13671875 }, { "epoch": 0.017994858611825194, "step": 182, "train/total_loss": 0.22459539771080017 }, { "entropy": 8.261810302734375, "epoch": 0.018093731461340717, "mean_token_accuracy": 0.706135630607605, "num_tokens": 1011883.0, "step": 183, "train/ce_loss": 1.440053105354309 }, { "epoch": 0.018093731461340717, "step": 183, "train/sim_loss": 0.22265625 }, { "epoch": 0.018093731461340717, "step": 183, "train/total_loss": 0.36666154861450195 }, { "entropy": 8.160216331481934, "epoch": 0.018192604310856238, "mean_token_accuracy": 0.7305936217308044, "num_tokens": 1017448.0, "step": 184, "train/ce_loss": 0.7345646619796753 }, { "epoch": 0.018192604310856238, "step": 184, "train/sim_loss": 0.19921875 }, { "epoch": 0.018192604310856238, "step": 184, "train/total_loss": 0.27267521619796753 }, { "entropy": 8.457632064819336, "epoch": 0.01829147716037176, "mean_token_accuracy": 0.7301587462425232, "num_tokens": 1022915.0, "step": 185, "train/ce_loss": 0.9440494179725647 }, { "epoch": 0.01829147716037176, "step": 185, "train/sim_loss": 0.18359375 }, { "epoch": 0.01829147716037176, "step": 185, "train/total_loss": 0.277998685836792 }, { "entropy": 8.316122055053711, "epoch": 0.018390350009887285, "mean_token_accuracy": 0.6663685441017151, "num_tokens": 1028641.0, "step": 186, "train/ce_loss": 1.2856032848358154 }, { "epoch": 0.018390350009887285, "step": 186, "train/sim_loss": 0.2109375 }, { "epoch": 0.018390350009887285, "step": 186, "train/total_loss": 0.339497834444046 }, { "entropy": 8.323981285095215, "epoch": 0.01848922285940281, "mean_token_accuracy": 0.6982671022415161, "num_tokens": 1034263.0, "step": 187, "train/ce_loss": 0.5654125213623047 }, { "epoch": 0.01848922285940281, "step": 187, "train/sim_loss": 0.140625 }, { "epoch": 0.01848922285940281, "step": 187, "train/total_loss": 0.19716624915599823 }, { "entropy": 8.40280532836914, "epoch": 0.018588095708918333, "mean_token_accuracy": 0.7852004170417786, "num_tokens": 1039899.0, "step": 188, "train/ce_loss": 0.5610334873199463 }, { "epoch": 0.018588095708918333, "step": 188, "train/sim_loss": 0.10546875 }, { "epoch": 0.018588095708918333, "step": 188, "train/total_loss": 0.16157209873199463 }, { "entropy": 8.275581359863281, "epoch": 0.018686968558433853, "mean_token_accuracy": 0.7175509929656982, "num_tokens": 1045822.0, "step": 189, "train/ce_loss": 0.48965680599212646 }, { "epoch": 0.018686968558433853, "step": 189, "train/sim_loss": 0.078125 }, { "epoch": 0.018686968558433853, "step": 189, "train/total_loss": 0.1270906776189804 }, { "entropy": 8.546506881713867, "epoch": 0.018785841407949377, "mean_token_accuracy": 0.7886363863945007, "num_tokens": 1051370.0, "step": 190, "train/ce_loss": 0.6778501868247986 }, { "epoch": 0.018785841407949377, "step": 190, "train/sim_loss": 0.08984375 }, { "epoch": 0.018785841407949377, "step": 190, "train/total_loss": 0.15762877464294434 }, { "entropy": 8.522708892822266, "epoch": 0.0188847142574649, "mean_token_accuracy": 0.7315598726272583, "num_tokens": 1056748.0, "step": 191, "train/ce_loss": 1.069607138633728 }, { "epoch": 0.0188847142574649, "step": 191, "train/sim_loss": 0.1171875 }, { "epoch": 0.0188847142574649, "step": 191, "train/total_loss": 0.2241482138633728 }, { "entropy": 8.610021591186523, "epoch": 0.018983587106980424, "mean_token_accuracy": 0.7424242496490479, "num_tokens": 1062161.0, "step": 192, "train/ce_loss": 1.1777212619781494 }, { "epoch": 0.018983587106980424, "step": 192, "train/sim_loss": 0.0859375 }, { "epoch": 0.018983587106980424, "step": 192, "train/total_loss": 0.20370963215827942 }, { "entropy": 8.526687622070312, "epoch": 0.019082459956495944, "mean_token_accuracy": 0.7011615633964539, "num_tokens": 1067760.0, "step": 193, "train/ce_loss": 0.8989193439483643 }, { "epoch": 0.019082459956495944, "step": 193, "train/sim_loss": 0.1640625 }, { "epoch": 0.019082459956495944, "step": 193, "train/total_loss": 0.2539544403553009 }, { "entropy": 8.407150268554688, "epoch": 0.019181332806011468, "mean_token_accuracy": 0.7451456189155579, "num_tokens": 1073245.0, "step": 194, "train/ce_loss": 1.1419775485992432 }, { "epoch": 0.019181332806011468, "step": 194, "train/sim_loss": 0.1953125 }, { "epoch": 0.019181332806011468, "step": 194, "train/total_loss": 0.3095102608203888 }, { "entropy": 8.893587112426758, "epoch": 0.019280205655526992, "mean_token_accuracy": 0.7132169604301453, "num_tokens": 1078588.0, "step": 195, "train/ce_loss": 0.8272491693496704 }, { "epoch": 0.019280205655526992, "step": 195, "train/sim_loss": 0.19140625 }, { "epoch": 0.019280205655526992, "step": 195, "train/total_loss": 0.274131178855896 }, { "entropy": 8.38595199584961, "epoch": 0.019379078505042516, "mean_token_accuracy": 0.7442622780799866, "num_tokens": 1084144.0, "step": 196, "train/ce_loss": 0.9956916570663452 }, { "epoch": 0.019379078505042516, "step": 196, "train/sim_loss": 0.078125 }, { "epoch": 0.019379078505042516, "step": 196, "train/total_loss": 0.177694171667099 }, { "entropy": 8.703729629516602, "epoch": 0.01947795135455804, "mean_token_accuracy": 0.7481752038002014, "num_tokens": 1089520.0, "step": 197, "train/ce_loss": 0.900157630443573 }, { "epoch": 0.01947795135455804, "step": 197, "train/sim_loss": 0.23828125 }, { "epoch": 0.01947795135455804, "step": 197, "train/total_loss": 0.3282970190048218 }, { "entropy": 8.81466293334961, "epoch": 0.01957682420407356, "mean_token_accuracy": 0.7281045913696289, "num_tokens": 1094911.0, "step": 198, "train/ce_loss": 1.273821234703064 }, { "epoch": 0.01957682420407356, "step": 198, "train/sim_loss": 0.22265625 }, { "epoch": 0.01957682420407356, "step": 198, "train/total_loss": 0.3500383794307709 }, { "entropy": 8.47381591796875, "epoch": 0.019675697053589083, "mean_token_accuracy": 0.7758793830871582, "num_tokens": 1100588.0, "step": 199, "train/ce_loss": 0.7371880412101746 }, { "epoch": 0.019675697053589083, "step": 199, "train/sim_loss": 0.13671875 }, { "epoch": 0.019675697053589083, "step": 199, "train/total_loss": 0.2104375660419464 }, { "epoch": 0.019774569903104607, "grad_norm": 1.1520451307296753, "learning_rate": 9.953271028037384e-06, "loss": 0.2566, "step": 200 }, { "entropy": 8.838947296142578, "epoch": 0.019774569903104607, "mean_token_accuracy": 0.7272727489471436, "num_tokens": 1106038.0, "step": 200, "train/ce_loss": 0.5501865148544312 }, { "epoch": 0.019774569903104607, "step": 200, "train/sim_loss": 0.25390625 }, { "epoch": 0.019774569903104607, "step": 200, "train/total_loss": 0.30892491340637207 }, { "entropy": 8.921218872070312, "epoch": 0.01987344275262013, "mean_token_accuracy": 0.7191011309623718, "num_tokens": 1111457.0, "step": 201, "train/ce_loss": 1.5149867534637451 }, { "epoch": 0.01987344275262013, "step": 201, "train/sim_loss": 0.12109375 }, { "epoch": 0.01987344275262013, "step": 201, "train/total_loss": 0.2725924253463745 }, { "entropy": 8.845229148864746, "epoch": 0.019972315602135655, "mean_token_accuracy": 0.7512820363044739, "num_tokens": 1116892.0, "step": 202, "train/ce_loss": 0.9696965217590332 }, { "epoch": 0.019972315602135655, "step": 202, "train/sim_loss": 0.171875 }, { "epoch": 0.019972315602135655, "step": 202, "train/total_loss": 0.2688446640968323 }, { "entropy": 8.638507843017578, "epoch": 0.020071188451651175, "mean_token_accuracy": 0.7100330591201782, "num_tokens": 1122405.0, "step": 203, "train/ce_loss": 0.4799044132232666 }, { "epoch": 0.020071188451651175, "step": 203, "train/sim_loss": 0.07421875 }, { "epoch": 0.020071188451651175, "step": 203, "train/total_loss": 0.12220919132232666 }, { "entropy": 8.81063461303711, "epoch": 0.0201700613011667, "mean_token_accuracy": 0.7320261597633362, "num_tokens": 1127822.0, "step": 204, "train/ce_loss": 0.49506309628486633 }, { "epoch": 0.0201700613011667, "step": 204, "train/sim_loss": 0.140625 }, { "epoch": 0.0201700613011667, "step": 204, "train/total_loss": 0.1901313066482544 }, { "entropy": 8.629983901977539, "epoch": 0.020268934150682223, "mean_token_accuracy": 0.7653276920318604, "num_tokens": 1133422.0, "step": 205, "train/ce_loss": 0.7373542189598083 }, { "epoch": 0.020268934150682223, "step": 205, "train/sim_loss": 0.16796875 }, { "epoch": 0.020268934150682223, "step": 205, "train/total_loss": 0.24170416593551636 }, { "entropy": 8.84347152709961, "epoch": 0.020367807000197746, "mean_token_accuracy": 0.7269585132598877, "num_tokens": 1138938.0, "step": 206, "train/ce_loss": 1.0288078784942627 }, { "epoch": 0.020367807000197746, "step": 206, "train/sim_loss": 0.1953125 }, { "epoch": 0.020367807000197746, "step": 206, "train/total_loss": 0.2981932759284973 }, { "entropy": 8.928255081176758, "epoch": 0.02046667984971327, "mean_token_accuracy": 0.6924999952316284, "num_tokens": 1144330.0, "step": 207, "train/ce_loss": 0.9214930534362793 }, { "epoch": 0.02046667984971327, "step": 207, "train/sim_loss": 0.125 }, { "epoch": 0.02046667984971327, "step": 207, "train/total_loss": 0.21714931726455688 }, { "entropy": 8.603888511657715, "epoch": 0.02056555269922879, "mean_token_accuracy": 0.7044146060943604, "num_tokens": 1149993.0, "step": 208, "train/ce_loss": 1.0280470848083496 }, { "epoch": 0.02056555269922879, "step": 208, "train/sim_loss": 0.1328125 }, { "epoch": 0.02056555269922879, "step": 208, "train/total_loss": 0.23561722040176392 }, { "entropy": 8.971364974975586, "epoch": 0.020664425548744314, "mean_token_accuracy": 0.7233115434646606, "num_tokens": 1155546.0, "step": 209, "train/ce_loss": 0.7916296720504761 }, { "epoch": 0.020664425548744314, "step": 209, "train/sim_loss": 0.125 }, { "epoch": 0.020664425548744314, "step": 209, "train/total_loss": 0.20416297018527985 }, { "entropy": 8.776910781860352, "epoch": 0.020763298398259838, "mean_token_accuracy": 0.761403501033783, "num_tokens": 1160973.0, "step": 210, "train/ce_loss": 0.7931421995162964 }, { "epoch": 0.020763298398259838, "step": 210, "train/sim_loss": 0.1328125 }, { "epoch": 0.020763298398259838, "step": 210, "train/total_loss": 0.2121267318725586 }, { "entropy": 8.56975269317627, "epoch": 0.02086217124777536, "mean_token_accuracy": 0.7221684455871582, "num_tokens": 1166623.0, "step": 211, "train/ce_loss": 1.2307994365692139 }, { "epoch": 0.02086217124777536, "step": 211, "train/sim_loss": 0.16015625 }, { "epoch": 0.02086217124777536, "step": 211, "train/total_loss": 0.28323620557785034 }, { "entropy": 8.886282920837402, "epoch": 0.020961044097290885, "mean_token_accuracy": 0.7496823668479919, "num_tokens": 1172059.0, "step": 212, "train/ce_loss": 0.8315246105194092 }, { "epoch": 0.020961044097290885, "step": 212, "train/sim_loss": 0.1875 }, { "epoch": 0.020961044097290885, "step": 212, "train/total_loss": 0.2706524729728699 }, { "entropy": 9.013860702514648, "epoch": 0.021059916946806406, "mean_token_accuracy": 0.7180722951889038, "num_tokens": 1177586.0, "step": 213, "train/ce_loss": 0.9549789428710938 }, { "epoch": 0.021059916946806406, "step": 213, "train/sim_loss": 0.19140625 }, { "epoch": 0.021059916946806406, "step": 213, "train/total_loss": 0.28690415620803833 }, { "entropy": 8.841848373413086, "epoch": 0.02115878979632193, "mean_token_accuracy": 0.7867095470428467, "num_tokens": 1183131.0, "step": 214, "train/ce_loss": 0.8128485679626465 }, { "epoch": 0.02115878979632193, "step": 214, "train/sim_loss": 0.13671875 }, { "epoch": 0.02115878979632193, "step": 214, "train/total_loss": 0.21800360083580017 }, { "entropy": 9.285442352294922, "epoch": 0.021257662645837453, "mean_token_accuracy": 0.7429906725883484, "num_tokens": 1188417.0, "step": 215, "train/ce_loss": 0.9570755362510681 }, { "epoch": 0.021257662645837453, "step": 215, "train/sim_loss": 0.1953125 }, { "epoch": 0.021257662645837453, "step": 215, "train/total_loss": 0.29102006554603577 }, { "entropy": 8.701044082641602, "epoch": 0.021356535495352977, "mean_token_accuracy": 0.7162725925445557, "num_tokens": 1193780.0, "step": 216, "train/ce_loss": 1.1010971069335938 }, { "epoch": 0.021356535495352977, "step": 216, "train/sim_loss": 0.22265625 }, { "epoch": 0.021356535495352977, "step": 216, "train/total_loss": 0.33276596665382385 }, { "entropy": 8.828420639038086, "epoch": 0.0214554083448685, "mean_token_accuracy": 0.7763713002204895, "num_tokens": 1199352.0, "step": 217, "train/ce_loss": 0.5445225834846497 }, { "epoch": 0.0214554083448685, "step": 217, "train/sim_loss": 0.13671875 }, { "epoch": 0.0214554083448685, "step": 217, "train/total_loss": 0.19117100536823273 }, { "entropy": 8.934491157531738, "epoch": 0.02155428119438402, "mean_token_accuracy": 0.7707641124725342, "num_tokens": 1204767.0, "step": 218, "train/ce_loss": 0.8267936110496521 }, { "epoch": 0.02155428119438402, "step": 218, "train/sim_loss": 0.07421875 }, { "epoch": 0.02155428119438402, "step": 218, "train/total_loss": 0.1568981111049652 }, { "entropy": 9.090204238891602, "epoch": 0.021653154043899545, "mean_token_accuracy": 0.6646266579627991, "num_tokens": 1210192.0, "step": 219, "train/ce_loss": 1.394233226776123 }, { "epoch": 0.021653154043899545, "step": 219, "train/sim_loss": 0.1953125 }, { "epoch": 0.021653154043899545, "step": 219, "train/total_loss": 0.33473581075668335 }, { "epoch": 0.02175202689341507, "grad_norm": 1.1519370079040527, "learning_rate": 9.948326163279435e-06, "loss": 0.244, "step": 220 }, { "entropy": 9.10263442993164, "epoch": 0.02175202689341507, "mean_token_accuracy": 0.7162162065505981, "num_tokens": 1215631.0, "step": 220, "train/ce_loss": 0.5412389039993286 }, { "epoch": 0.02175202689341507, "step": 220, "train/sim_loss": 0.140625 }, { "epoch": 0.02175202689341507, "step": 220, "train/total_loss": 0.1947488933801651 }, { "entropy": 8.977800369262695, "epoch": 0.021850899742930592, "mean_token_accuracy": 0.7722660899162292, "num_tokens": 1220994.0, "step": 221, "train/ce_loss": 0.49805518984794617 }, { "epoch": 0.021850899742930592, "step": 221, "train/sim_loss": 0.1484375 }, { "epoch": 0.021850899742930592, "step": 221, "train/total_loss": 0.19824302196502686 }, { "entropy": 8.960020065307617, "epoch": 0.021949772592446116, "mean_token_accuracy": 0.7001153230667114, "num_tokens": 1226454.0, "step": 222, "train/ce_loss": 0.9061692953109741 }, { "epoch": 0.021949772592446116, "step": 222, "train/sim_loss": 0.12890625 }, { "epoch": 0.021949772592446116, "step": 222, "train/total_loss": 0.21952319145202637 }, { "entropy": 8.894418716430664, "epoch": 0.022048645441961636, "mean_token_accuracy": 0.7764127850532532, "num_tokens": 1231881.0, "step": 223, "train/ce_loss": 0.5557315945625305 }, { "epoch": 0.022048645441961636, "step": 223, "train/sim_loss": 0.1640625 }, { "epoch": 0.022048645441961636, "step": 223, "train/total_loss": 0.21963566541671753 }, { "entropy": 9.046100616455078, "epoch": 0.02214751829147716, "mean_token_accuracy": 0.7652284502983093, "num_tokens": 1237240.0, "step": 224, "train/ce_loss": 0.5726690888404846 }, { "epoch": 0.02214751829147716, "step": 224, "train/sim_loss": 0.11328125 }, { "epoch": 0.02214751829147716, "step": 224, "train/total_loss": 0.17054815590381622 }, { "entropy": 8.880996704101562, "epoch": 0.022246391140992684, "mean_token_accuracy": 0.8055555820465088, "num_tokens": 1242708.0, "step": 225, "train/ce_loss": 0.601108193397522 }, { "epoch": 0.022246391140992684, "step": 225, "train/sim_loss": 0.109375 }, { "epoch": 0.022246391140992684, "step": 225, "train/total_loss": 0.16948582231998444 }, { "entropy": 8.795853614807129, "epoch": 0.022345263990508207, "mean_token_accuracy": 0.7090163826942444, "num_tokens": 1248268.0, "step": 226, "train/ce_loss": 0.5646979808807373 }, { "epoch": 0.022345263990508207, "step": 226, "train/sim_loss": 0.11328125 }, { "epoch": 0.022345263990508207, "step": 226, "train/total_loss": 0.16975104808807373 }, { "entropy": 8.672819137573242, "epoch": 0.02244413684002373, "mean_token_accuracy": 0.723809540271759, "num_tokens": 1253909.0, "step": 227, "train/ce_loss": 1.3073190450668335 }, { "epoch": 0.02244413684002373, "step": 227, "train/sim_loss": 0.15234375 }, { "epoch": 0.02244413684002373, "step": 227, "train/total_loss": 0.2830756604671478 }, { "entropy": 9.140840530395508, "epoch": 0.02254300968953925, "mean_token_accuracy": 0.7344912886619568, "num_tokens": 1259357.0, "step": 228, "train/ce_loss": 0.8109186887741089 }, { "epoch": 0.02254300968953925, "step": 228, "train/sim_loss": 0.1796875 }, { "epoch": 0.02254300968953925, "step": 228, "train/total_loss": 0.26077938079833984 }, { "entropy": 9.173983573913574, "epoch": 0.022641882539054775, "mean_token_accuracy": 0.7452152967453003, "num_tokens": 1264815.0, "step": 229, "train/ce_loss": 0.5319539308547974 }, { "epoch": 0.022641882539054775, "step": 229, "train/sim_loss": 0.1796875 }, { "epoch": 0.022641882539054775, "step": 229, "train/total_loss": 0.23288288712501526 }, { "entropy": 9.212177276611328, "epoch": 0.0227407553885703, "mean_token_accuracy": 0.7089715600013733, "num_tokens": 1270438.0, "step": 230, "train/ce_loss": 1.3914085626602173 }, { "epoch": 0.0227407553885703, "step": 230, "train/sim_loss": 0.21875 }, { "epoch": 0.0227407553885703, "step": 230, "train/total_loss": 0.3578908443450928 }, { "entropy": 8.679014205932617, "epoch": 0.022839628238085823, "mean_token_accuracy": 0.71757572889328, "num_tokens": 1275881.0, "step": 231, "train/ce_loss": 0.9911695122718811 }, { "epoch": 0.022839628238085823, "step": 231, "train/sim_loss": 0.203125 }, { "epoch": 0.022839628238085823, "step": 231, "train/total_loss": 0.3022419512271881 }, { "entropy": 9.256446838378906, "epoch": 0.022938501087601346, "mean_token_accuracy": 0.7333333492279053, "num_tokens": 1281125.0, "step": 232, "train/ce_loss": 1.761154055595398 }, { "epoch": 0.022938501087601346, "step": 232, "train/sim_loss": 0.16796875 }, { "epoch": 0.022938501087601346, "step": 232, "train/total_loss": 0.34408414363861084 }, { "entropy": 8.97947883605957, "epoch": 0.023037373937116867, "mean_token_accuracy": 0.7077294588088989, "num_tokens": 1286625.0, "step": 233, "train/ce_loss": 1.1427340507507324 }, { "epoch": 0.023037373937116867, "step": 233, "train/sim_loss": 0.1953125 }, { "epoch": 0.023037373937116867, "step": 233, "train/total_loss": 0.30958589911460876 }, { "entropy": 8.798945426940918, "epoch": 0.02313624678663239, "mean_token_accuracy": 0.7423934936523438, "num_tokens": 1292257.0, "step": 234, "train/ce_loss": 0.7580620646476746 }, { "epoch": 0.02313624678663239, "step": 234, "train/sim_loss": 0.08203125 }, { "epoch": 0.02313624678663239, "step": 234, "train/total_loss": 0.15783745050430298 }, { "entropy": 8.855005264282227, "epoch": 0.023235119636147914, "mean_token_accuracy": 0.7916666865348816, "num_tokens": 1297780.0, "step": 235, "train/ce_loss": 0.8942148685455322 }, { "epoch": 0.023235119636147914, "step": 235, "train/sim_loss": 0.12109375 }, { "epoch": 0.023235119636147914, "step": 235, "train/total_loss": 0.21051523089408875 }, { "entropy": 8.787056922912598, "epoch": 0.023333992485663438, "mean_token_accuracy": 0.7422389388084412, "num_tokens": 1303375.0, "step": 236, "train/ce_loss": 0.6811010837554932 }, { "epoch": 0.023333992485663438, "step": 236, "train/sim_loss": 0.078125 }, { "epoch": 0.023333992485663438, "step": 236, "train/total_loss": 0.14623510837554932 }, { "entropy": 9.170133590698242, "epoch": 0.023432865335178958, "mean_token_accuracy": 0.7535853981971741, "num_tokens": 1308760.0, "step": 237, "train/ce_loss": 0.7095848321914673 }, { "epoch": 0.023432865335178958, "step": 237, "train/sim_loss": 0.1328125 }, { "epoch": 0.023432865335178958, "step": 237, "train/total_loss": 0.20377099514007568 }, { "entropy": 9.261778831481934, "epoch": 0.023531738184694482, "mean_token_accuracy": 0.7182235717773438, "num_tokens": 1314104.0, "step": 238, "train/ce_loss": 1.362142562866211 }, { "epoch": 0.023531738184694482, "step": 238, "train/sim_loss": 0.1640625 }, { "epoch": 0.023531738184694482, "step": 238, "train/total_loss": 0.3002767562866211 }, { "entropy": 9.019259452819824, "epoch": 0.023630611034210006, "mean_token_accuracy": 0.7557172775268555, "num_tokens": 1319664.0, "step": 239, "train/ce_loss": 0.7462193965911865 }, { "epoch": 0.023630611034210006, "step": 239, "train/sim_loss": 0.125 }, { "epoch": 0.023630611034210006, "step": 239, "train/total_loss": 0.19962194561958313 }, { "epoch": 0.02372948388372553, "grad_norm": 1.2561548948287964, "learning_rate": 9.943381298521487e-06, "loss": 0.2256, "step": 240 }, { "entropy": 9.3170747756958, "epoch": 0.02372948388372553, "mean_token_accuracy": 0.7321652173995972, "num_tokens": 1325015.0, "step": 240, "train/ce_loss": 0.3093976080417633 }, { "epoch": 0.02372948388372553, "step": 240, "train/sim_loss": 0.109375 }, { "epoch": 0.02372948388372553, "step": 240, "train/total_loss": 0.1403147578239441 }, { "entropy": 9.059513092041016, "epoch": 0.023828356733241053, "mean_token_accuracy": 0.7055137753486633, "num_tokens": 1330445.0, "step": 241, "train/ce_loss": 0.535310685634613 }, { "epoch": 0.023828356733241053, "step": 241, "train/sim_loss": 0.12109375 }, { "epoch": 0.023828356733241053, "step": 241, "train/total_loss": 0.17462481558322906 }, { "entropy": 8.748943328857422, "epoch": 0.023927229582756573, "mean_token_accuracy": 0.7298850417137146, "num_tokens": 1336074.0, "step": 242, "train/ce_loss": 0.6817721128463745 }, { "epoch": 0.023927229582756573, "step": 242, "train/sim_loss": 0.24609375 }, { "epoch": 0.023927229582756573, "step": 242, "train/total_loss": 0.3142709732055664 }, { "entropy": 8.975515365600586, "epoch": 0.024026102432272097, "mean_token_accuracy": 0.6978417038917542, "num_tokens": 1341545.0, "step": 243, "train/ce_loss": 0.988120973110199 }, { "epoch": 0.024026102432272097, "step": 243, "train/sim_loss": 0.15234375 }, { "epoch": 0.024026102432272097, "step": 243, "train/total_loss": 0.2511558532714844 }, { "entropy": 8.782365798950195, "epoch": 0.02412497528178762, "mean_token_accuracy": 0.7471022009849548, "num_tokens": 1347182.0, "step": 244, "train/ce_loss": 1.439424991607666 }, { "epoch": 0.02412497528178762, "step": 244, "train/sim_loss": 0.171875 }, { "epoch": 0.02412497528178762, "step": 244, "train/total_loss": 0.3158175051212311 }, { "entropy": 9.158496856689453, "epoch": 0.024223848131303145, "mean_token_accuracy": 0.6976470351219177, "num_tokens": 1352691.0, "step": 245, "train/ce_loss": 1.1117689609527588 }, { "epoch": 0.024223848131303145, "step": 245, "train/sim_loss": 0.1640625 }, { "epoch": 0.024223848131303145, "step": 245, "train/total_loss": 0.27523940801620483 }, { "entropy": 9.07068920135498, "epoch": 0.02432272098081867, "mean_token_accuracy": 0.7394015192985535, "num_tokens": 1358106.0, "step": 246, "train/ce_loss": 0.8557369112968445 }, { "epoch": 0.02432272098081867, "step": 246, "train/sim_loss": 0.1640625 }, { "epoch": 0.02432272098081867, "step": 246, "train/total_loss": 0.2496362030506134 }, { "entropy": 8.685247421264648, "epoch": 0.02442159383033419, "mean_token_accuracy": 0.724750280380249, "num_tokens": 1363750.0, "step": 247, "train/ce_loss": 1.4586645364761353 }, { "epoch": 0.02442159383033419, "step": 247, "train/sim_loss": 0.17578125 }, { "epoch": 0.02442159383033419, "step": 247, "train/total_loss": 0.3216477036476135 }, { "entropy": 9.276339530944824, "epoch": 0.024520466679849712, "mean_token_accuracy": 0.7544987201690674, "num_tokens": 1369121.0, "step": 248, "train/ce_loss": 0.8595091700553894 }, { "epoch": 0.024520466679849712, "step": 248, "train/sim_loss": 0.16796875 }, { "epoch": 0.024520466679849712, "step": 248, "train/total_loss": 0.25391966104507446 }, { "entropy": 9.100275993347168, "epoch": 0.024619339529365236, "mean_token_accuracy": 0.6977300047874451, "num_tokens": 1374568.0, "step": 249, "train/ce_loss": 1.0104544162750244 }, { "epoch": 0.024619339529365236, "step": 249, "train/sim_loss": 0.15625 }, { "epoch": 0.024619339529365236, "step": 249, "train/total_loss": 0.2572954297065735 }, { "entropy": 9.006145477294922, "epoch": 0.02471821237888076, "mean_token_accuracy": 0.7450980544090271, "num_tokens": 1380079.0, "step": 250, "train/ce_loss": 0.9873389601707458 }, { "epoch": 0.02471821237888076, "step": 250, "train/sim_loss": 0.14453125 }, { "epoch": 0.02471821237888076, "step": 250, "train/total_loss": 0.24326515197753906 }, { "entropy": 8.80069351196289, "epoch": 0.024817085228396284, "mean_token_accuracy": 0.7277085185050964, "num_tokens": 1385730.0, "step": 251, "train/ce_loss": 1.059699535369873 }, { "epoch": 0.024817085228396284, "step": 251, "train/sim_loss": 0.125 }, { "epoch": 0.024817085228396284, "step": 251, "train/total_loss": 0.23096996545791626 }, { "entropy": 9.072319984436035, "epoch": 0.024915958077911804, "mean_token_accuracy": 0.7182447910308838, "num_tokens": 1391224.0, "step": 252, "train/ce_loss": 1.1458444595336914 }, { "epoch": 0.024915958077911804, "step": 252, "train/sim_loss": 0.16796875 }, { "epoch": 0.024915958077911804, "step": 252, "train/total_loss": 0.28255319595336914 }, { "entropy": 8.788187980651855, "epoch": 0.025014830927427328, "mean_token_accuracy": 0.7741596698760986, "num_tokens": 1396883.0, "step": 253, "train/ce_loss": 0.5919986367225647 }, { "epoch": 0.025014830927427328, "step": 253, "train/sim_loss": 0.17578125 }, { "epoch": 0.025014830927427328, "step": 253, "train/total_loss": 0.23498111963272095 }, { "entropy": 8.874147415161133, "epoch": 0.02511370377694285, "mean_token_accuracy": 0.6985018849372864, "num_tokens": 1402562.0, "step": 254, "train/ce_loss": 1.1492795944213867 }, { "epoch": 0.02511370377694285, "step": 254, "train/sim_loss": 0.1484375 }, { "epoch": 0.02511370377694285, "step": 254, "train/total_loss": 0.2633654475212097 }, { "entropy": 9.182164192199707, "epoch": 0.025212576626458375, "mean_token_accuracy": 0.7575757503509521, "num_tokens": 1407986.0, "step": 255, "train/ce_loss": 0.7376349568367004 }, { "epoch": 0.025212576626458375, "step": 255, "train/sim_loss": 0.140625 }, { "epoch": 0.025212576626458375, "step": 255, "train/total_loss": 0.21438848972320557 }, { "entropy": 9.366623878479004, "epoch": 0.0253114494759739, "mean_token_accuracy": 0.765625, "num_tokens": 1413392.0, "step": 256, "train/ce_loss": 0.6558706760406494 }, { "epoch": 0.0253114494759739, "step": 256, "train/sim_loss": 0.125 }, { "epoch": 0.0253114494759739, "step": 256, "train/total_loss": 0.19058707356452942 }, { "entropy": 9.113519668579102, "epoch": 0.02541032232548942, "mean_token_accuracy": 0.7071078419685364, "num_tokens": 1418805.0, "step": 257, "train/ce_loss": 0.9075577855110168 }, { "epoch": 0.02541032232548942, "step": 257, "train/sim_loss": 0.109375 }, { "epoch": 0.02541032232548942, "step": 257, "train/total_loss": 0.20013079047203064 }, { "entropy": 8.837578773498535, "epoch": 0.025509195175004943, "mean_token_accuracy": 0.7087857723236084, "num_tokens": 1424448.0, "step": 258, "train/ce_loss": 0.6658857464790344 }, { "epoch": 0.025509195175004943, "step": 258, "train/sim_loss": 0.24609375 }, { "epoch": 0.025509195175004943, "step": 258, "train/total_loss": 0.3126823306083679 }, { "entropy": 8.924936294555664, "epoch": 0.025608068024520467, "mean_token_accuracy": 0.7170417904853821, "num_tokens": 1430064.0, "step": 259, "train/ce_loss": 0.5701555013656616 }, { "epoch": 0.025608068024520467, "step": 259, "train/sim_loss": 0.15234375 }, { "epoch": 0.025608068024520467, "step": 259, "train/total_loss": 0.2093593031167984 }, { "epoch": 0.02570694087403599, "grad_norm": 1.1377497911453247, "learning_rate": 9.938436433763537e-06, "loss": 0.2409, "step": 260 }, { "entropy": 8.959202766418457, "epoch": 0.02570694087403599, "mean_token_accuracy": 0.7394871711730957, "num_tokens": 1435634.0, "step": 260, "train/ce_loss": 0.7609110474586487 }, { "epoch": 0.02570694087403599, "step": 260, "train/sim_loss": 0.15234375 }, { "epoch": 0.02570694087403599, "step": 260, "train/total_loss": 0.22843486070632935 }, { "entropy": 9.163043975830078, "epoch": 0.025805813723551514, "mean_token_accuracy": 0.7459016442298889, "num_tokens": 1441155.0, "step": 261, "train/ce_loss": 0.7729380130767822 }, { "epoch": 0.025805813723551514, "step": 261, "train/sim_loss": 0.1171875 }, { "epoch": 0.025805813723551514, "step": 261, "train/total_loss": 0.19448131322860718 }, { "entropy": 8.960432052612305, "epoch": 0.025904686573067034, "mean_token_accuracy": 0.6906318068504333, "num_tokens": 1446686.0, "step": 262, "train/ce_loss": 1.7350761890411377 }, { "epoch": 0.025904686573067034, "step": 262, "train/sim_loss": 0.18359375 }, { "epoch": 0.025904686573067034, "step": 262, "train/total_loss": 0.3571013808250427 }, { "entropy": 9.366714477539062, "epoch": 0.026003559422582558, "mean_token_accuracy": 0.6954612135887146, "num_tokens": 1451996.0, "step": 263, "train/ce_loss": 1.1510272026062012 }, { "epoch": 0.026003559422582558, "step": 263, "train/sim_loss": 0.13671875 }, { "epoch": 0.026003559422582558, "step": 263, "train/total_loss": 0.25182145833969116 }, { "entropy": 9.147409439086914, "epoch": 0.026102432272098082, "mean_token_accuracy": 0.7030033469200134, "num_tokens": 1457420.0, "step": 264, "train/ce_loss": 1.157666802406311 }, { "epoch": 0.026102432272098082, "step": 264, "train/sim_loss": 0.21875 }, { "epoch": 0.026102432272098082, "step": 264, "train/total_loss": 0.3345166742801666 }, { "entropy": 9.276344299316406, "epoch": 0.026201305121613606, "mean_token_accuracy": 0.7247259616851807, "num_tokens": 1462781.0, "step": 265, "train/ce_loss": 0.8856678605079651 }, { "epoch": 0.026201305121613606, "step": 265, "train/sim_loss": 0.15625 }, { "epoch": 0.026201305121613606, "step": 265, "train/total_loss": 0.24481678009033203 }, { "entropy": 8.996925354003906, "epoch": 0.02630017797112913, "mean_token_accuracy": 0.6948979496955872, "num_tokens": 1468410.0, "step": 266, "train/ce_loss": 0.8653662204742432 }, { "epoch": 0.02630017797112913, "step": 266, "train/sim_loss": 0.22265625 }, { "epoch": 0.02630017797112913, "step": 266, "train/total_loss": 0.30919286608695984 }, { "entropy": 9.15163803100586, "epoch": 0.02639905082064465, "mean_token_accuracy": 0.7523474097251892, "num_tokens": 1473904.0, "step": 267, "train/ce_loss": 0.6376057863235474 }, { "epoch": 0.02639905082064465, "step": 267, "train/sim_loss": 0.16796875 }, { "epoch": 0.02639905082064465, "step": 267, "train/total_loss": 0.23172932863235474 }, { "entropy": 8.997777938842773, "epoch": 0.026497923670160173, "mean_token_accuracy": 0.7226606607437134, "num_tokens": 1479374.0, "step": 268, "train/ce_loss": 0.5620176792144775 }, { "epoch": 0.026497923670160173, "step": 268, "train/sim_loss": 0.14453125 }, { "epoch": 0.026497923670160173, "step": 268, "train/total_loss": 0.20073302090168 }, { "entropy": 9.093475341796875, "epoch": 0.026596796519675697, "mean_token_accuracy": 0.7729306221008301, "num_tokens": 1484913.0, "step": 269, "train/ce_loss": 0.9793072938919067 }, { "epoch": 0.026596796519675697, "step": 269, "train/sim_loss": 0.1328125 }, { "epoch": 0.026596796519675697, "step": 269, "train/total_loss": 0.23074322938919067 }, { "entropy": 9.119166374206543, "epoch": 0.02669566936919122, "mean_token_accuracy": 0.7147971391677856, "num_tokens": 1490345.0, "step": 270, "train/ce_loss": 1.3623946905136108 }, { "epoch": 0.02669566936919122, "step": 270, "train/sim_loss": 0.140625 }, { "epoch": 0.02669566936919122, "step": 270, "train/total_loss": 0.2768644690513611 }, { "entropy": 8.525182723999023, "epoch": 0.026794542218706745, "mean_token_accuracy": 0.7129543423652649, "num_tokens": 1496107.0, "step": 271, "train/ce_loss": 1.1013684272766113 }, { "epoch": 0.026794542218706745, "step": 271, "train/sim_loss": 0.15625 }, { "epoch": 0.026794542218706745, "step": 271, "train/total_loss": 0.26638683676719666 }, { "entropy": 8.858224868774414, "epoch": 0.026893415068222265, "mean_token_accuracy": 0.7253086566925049, "num_tokens": 1501719.0, "step": 272, "train/ce_loss": 1.116441249847412 }, { "epoch": 0.026893415068222265, "step": 272, "train/sim_loss": 0.15234375 }, { "epoch": 0.026893415068222265, "step": 272, "train/total_loss": 0.26398786902427673 }, { "entropy": 9.096494674682617, "epoch": 0.02699228791773779, "mean_token_accuracy": 0.7254902124404907, "num_tokens": 1507146.0, "step": 273, "train/ce_loss": 0.477142870426178 }, { "epoch": 0.02699228791773779, "step": 273, "train/sim_loss": 0.09375 }, { "epoch": 0.02699228791773779, "step": 273, "train/total_loss": 0.14146429300308228 }, { "entropy": 9.10273265838623, "epoch": 0.027091160767253313, "mean_token_accuracy": 0.7545661926269531, "num_tokens": 1512584.0, "step": 274, "train/ce_loss": 0.8062126636505127 }, { "epoch": 0.027091160767253313, "step": 274, "train/sim_loss": 0.1953125 }, { "epoch": 0.027091160767253313, "step": 274, "train/total_loss": 0.27593377232551575 }, { "entropy": 8.892921447753906, "epoch": 0.027190033616768836, "mean_token_accuracy": 0.6948955655097961, "num_tokens": 1518091.0, "step": 275, "train/ce_loss": 1.4555143117904663 }, { "epoch": 0.027190033616768836, "step": 275, "train/sim_loss": 0.1640625 }, { "epoch": 0.027190033616768836, "step": 275, "train/total_loss": 0.3096139430999756 }, { "entropy": 9.080606460571289, "epoch": 0.02728890646628436, "mean_token_accuracy": 0.7821297645568848, "num_tokens": 1523530.0, "step": 276, "train/ce_loss": 0.7964259386062622 }, { "epoch": 0.02728890646628436, "step": 276, "train/sim_loss": 0.12109375 }, { "epoch": 0.02728890646628436, "step": 276, "train/total_loss": 0.20073634386062622 }, { "entropy": 9.150467872619629, "epoch": 0.02738777931579988, "mean_token_accuracy": 0.7022472023963928, "num_tokens": 1529090.0, "step": 277, "train/ce_loss": 0.35567423701286316 }, { "epoch": 0.02738777931579988, "step": 277, "train/sim_loss": 0.10546875 }, { "epoch": 0.02738777931579988, "step": 277, "train/total_loss": 0.14103618264198303 }, { "entropy": 8.955547332763672, "epoch": 0.027486652165315404, "mean_token_accuracy": 0.7954545617103577, "num_tokens": 1534666.0, "step": 278, "train/ce_loss": 0.6366835832595825 }, { "epoch": 0.027486652165315404, "step": 278, "train/sim_loss": 0.09765625 }, { "epoch": 0.027486652165315404, "step": 278, "train/total_loss": 0.1613246202468872 }, { "entropy": 9.016998291015625, "epoch": 0.027585525014830928, "mean_token_accuracy": 0.7088204026222229, "num_tokens": 1540184.0, "step": 279, "train/ce_loss": 0.9254931211471558 }, { "epoch": 0.027585525014830928, "step": 279, "train/sim_loss": 0.14453125 }, { "epoch": 0.027585525014830928, "step": 279, "train/total_loss": 0.23708057403564453 }, { "epoch": 0.02768439786434645, "grad_norm": 1.3412216901779175, "learning_rate": 9.93349156900559e-06, "loss": 0.2252, "step": 280 }, { "entropy": 8.941593170166016, "epoch": 0.02768439786434645, "mean_token_accuracy": 0.743047833442688, "num_tokens": 1545727.0, "step": 280, "train/ce_loss": 1.0981823205947876 }, { "epoch": 0.02768439786434645, "step": 280, "train/sim_loss": 0.13671875 }, { "epoch": 0.02768439786434645, "step": 280, "train/total_loss": 0.246536985039711 }, { "entropy": 8.967143058776855, "epoch": 0.027783270713861972, "mean_token_accuracy": 0.7211111187934875, "num_tokens": 1551281.0, "step": 281, "train/ce_loss": 0.568958580493927 }, { "epoch": 0.027783270713861972, "step": 281, "train/sim_loss": 0.125 }, { "epoch": 0.027783270713861972, "step": 281, "train/total_loss": 0.18189585208892822 }, { "entropy": 9.091773986816406, "epoch": 0.027882143563377496, "mean_token_accuracy": 0.7215909361839294, "num_tokens": 1556779.0, "step": 282, "train/ce_loss": 0.7791494727134705 }, { "epoch": 0.027882143563377496, "step": 282, "train/sim_loss": 0.12890625 }, { "epoch": 0.027882143563377496, "step": 282, "train/total_loss": 0.20682120323181152 }, { "entropy": 9.319311141967773, "epoch": 0.02798101641289302, "mean_token_accuracy": 0.7669270634651184, "num_tokens": 1562060.0, "step": 283, "train/ce_loss": 0.9666864275932312 }, { "epoch": 0.02798101641289302, "step": 283, "train/sim_loss": 0.10546875 }, { "epoch": 0.02798101641289302, "step": 283, "train/total_loss": 0.20213739573955536 }, { "entropy": 9.076925277709961, "epoch": 0.028079889262408543, "mean_token_accuracy": 0.7902494072914124, "num_tokens": 1567567.0, "step": 284, "train/ce_loss": 0.8583142161369324 }, { "epoch": 0.028079889262408543, "step": 284, "train/sim_loss": 0.1640625 }, { "epoch": 0.028079889262408543, "step": 284, "train/total_loss": 0.2498939335346222 }, { "entropy": 8.89913558959961, "epoch": 0.028178762111924067, "mean_token_accuracy": 0.7678571343421936, "num_tokens": 1573094.0, "step": 285, "train/ce_loss": 0.8704280257225037 }, { "epoch": 0.028178762111924067, "step": 285, "train/sim_loss": 0.1015625 }, { "epoch": 0.028178762111924067, "step": 285, "train/total_loss": 0.18860530853271484 }, { "entropy": 9.357723236083984, "epoch": 0.028277634961439587, "mean_token_accuracy": 0.7113665342330933, "num_tokens": 1578542.0, "step": 286, "train/ce_loss": 0.9495486617088318 }, { "epoch": 0.028277634961439587, "step": 286, "train/sim_loss": 0.140625 }, { "epoch": 0.028277634961439587, "step": 286, "train/total_loss": 0.23557987809181213 }, { "entropy": 8.983036041259766, "epoch": 0.02837650781095511, "mean_token_accuracy": 0.78899085521698, "num_tokens": 1584031.0, "step": 287, "train/ce_loss": 1.0337260961532593 }, { "epoch": 0.02837650781095511, "step": 287, "train/sim_loss": 0.18359375 }, { "epoch": 0.02837650781095511, "step": 287, "train/total_loss": 0.28696635365486145 }, { "entropy": 9.446710586547852, "epoch": 0.028475380660470635, "mean_token_accuracy": 0.7503392100334167, "num_tokens": 1589308.0, "step": 288, "train/ce_loss": 0.6321821212768555 }, { "epoch": 0.028475380660470635, "step": 288, "train/sim_loss": 0.078125 }, { "epoch": 0.028475380660470635, "step": 288, "train/total_loss": 0.14134320616722107 }, { "entropy": 9.170350074768066, "epoch": 0.02857425350998616, "mean_token_accuracy": 0.7217597961425781, "num_tokens": 1594798.0, "step": 289, "train/ce_loss": 1.278653860092163 }, { "epoch": 0.02857425350998616, "step": 289, "train/sim_loss": 0.1640625 }, { "epoch": 0.02857425350998616, "step": 289, "train/total_loss": 0.29192787408828735 }, { "entropy": 9.309854507446289, "epoch": 0.028673126359501682, "mean_token_accuracy": 0.7394015192985535, "num_tokens": 1600235.0, "step": 290, "train/ce_loss": 0.9589440226554871 }, { "epoch": 0.028673126359501682, "step": 290, "train/sim_loss": 0.0859375 }, { "epoch": 0.028673126359501682, "step": 290, "train/total_loss": 0.18183189630508423 }, { "entropy": 9.106192588806152, "epoch": 0.028771999209017202, "mean_token_accuracy": 0.7857142686843872, "num_tokens": 1605648.0, "step": 291, "train/ce_loss": 0.6705142855644226 }, { "epoch": 0.028771999209017202, "step": 291, "train/sim_loss": 0.046875 }, { "epoch": 0.028771999209017202, "step": 291, "train/total_loss": 0.11392643302679062 }, { "entropy": 9.344202041625977, "epoch": 0.028870872058532726, "mean_token_accuracy": 0.7228116989135742, "num_tokens": 1611017.0, "step": 292, "train/ce_loss": 1.1649410724639893 }, { "epoch": 0.028870872058532726, "step": 292, "train/sim_loss": 0.16796875 }, { "epoch": 0.028870872058532726, "step": 292, "train/total_loss": 0.2844628691673279 }, { "entropy": 9.250099182128906, "epoch": 0.02896974490804825, "mean_token_accuracy": 0.7506265640258789, "num_tokens": 1616492.0, "step": 293, "train/ce_loss": 0.6821977496147156 }, { "epoch": 0.02896974490804825, "step": 293, "train/sim_loss": 0.171875 }, { "epoch": 0.02896974490804825, "step": 293, "train/total_loss": 0.24009478092193604 }, { "entropy": 8.81739330291748, "epoch": 0.029068617757563774, "mean_token_accuracy": 0.7145488262176514, "num_tokens": 1622128.0, "step": 294, "train/ce_loss": 1.0614161491394043 }, { "epoch": 0.029068617757563774, "step": 294, "train/sim_loss": 0.10546875 }, { "epoch": 0.029068617757563774, "step": 294, "train/total_loss": 0.21161037683486938 }, { "entropy": 8.974885940551758, "epoch": 0.029167490607079297, "mean_token_accuracy": 0.7277542352676392, "num_tokens": 1627666.0, "step": 295, "train/ce_loss": 1.0640629529953003 }, { "epoch": 0.029167490607079297, "step": 295, "train/sim_loss": 0.12109375 }, { "epoch": 0.029167490607079297, "step": 295, "train/total_loss": 0.2275000512599945 }, { "entropy": 9.108479499816895, "epoch": 0.029266363456594818, "mean_token_accuracy": 0.7479079365730286, "num_tokens": 1633204.0, "step": 296, "train/ce_loss": 0.7535548806190491 }, { "epoch": 0.029266363456594818, "step": 296, "train/sim_loss": 0.10546875 }, { "epoch": 0.029266363456594818, "step": 296, "train/total_loss": 0.18082424998283386 }, { "entropy": 9.045448303222656, "epoch": 0.02936523630611034, "mean_token_accuracy": 0.7249034643173218, "num_tokens": 1638798.0, "step": 297, "train/ce_loss": 0.2908283770084381 }, { "epoch": 0.02936523630611034, "step": 297, "train/sim_loss": 0.0859375 }, { "epoch": 0.02936523630611034, "step": 297, "train/total_loss": 0.11502033472061157 }, { "entropy": 8.81541633605957, "epoch": 0.029464109155625865, "mean_token_accuracy": 0.6783625483512878, "num_tokens": 1644491.0, "step": 298, "train/ce_loss": 0.7956336736679077 }, { "epoch": 0.029464109155625865, "step": 298, "train/sim_loss": 0.140625 }, { "epoch": 0.029464109155625865, "step": 298, "train/total_loss": 0.22018837928771973 }, { "entropy": 9.319121360778809, "epoch": 0.02956298200514139, "mean_token_accuracy": 0.6982543468475342, "num_tokens": 1649853.0, "step": 299, "train/ce_loss": 0.7503468990325928 }, { "epoch": 0.02956298200514139, "step": 299, "train/sim_loss": 0.1484375 }, { "epoch": 0.02956298200514139, "step": 299, "train/total_loss": 0.22347219288349152 }, { "epoch": 0.029661854854656913, "grad_norm": 1.241660237312317, "learning_rate": 9.928546704247638e-06, "loss": 0.214, "step": 300 }, { "entropy": 9.196109771728516, "epoch": 0.029661854854656913, "mean_token_accuracy": 0.7450549602508545, "num_tokens": 1655408.0, "step": 300, "train/ce_loss": 0.6492835283279419 }, { "epoch": 0.029661854854656913, "step": 300, "train/sim_loss": 0.16796875 }, { "epoch": 0.029661854854656913, "step": 300, "train/total_loss": 0.2328971028327942 }, { "entropy": 9.381685256958008, "epoch": 0.029760727704172433, "mean_token_accuracy": 0.6731266379356384, "num_tokens": 1660773.0, "step": 301, "train/ce_loss": 0.9430516958236694 }, { "epoch": 0.029760727704172433, "step": 301, "train/sim_loss": 0.17578125 }, { "epoch": 0.029760727704172433, "step": 301, "train/total_loss": 0.270086407661438 }, { "entropy": 9.334796905517578, "epoch": 0.029859600553687957, "mean_token_accuracy": 0.7333333492279053, "num_tokens": 1666223.0, "step": 302, "train/ce_loss": 1.020687222480774 }, { "epoch": 0.029859600553687957, "step": 302, "train/sim_loss": 0.14453125 }, { "epoch": 0.029859600553687957, "step": 302, "train/total_loss": 0.2465999722480774 }, { "entropy": 9.052467346191406, "epoch": 0.02995847340320348, "mean_token_accuracy": 0.7248061895370483, "num_tokens": 1671902.0, "step": 303, "train/ce_loss": 0.6706146001815796 }, { "epoch": 0.02995847340320348, "step": 303, "train/sim_loss": 0.12109375 }, { "epoch": 0.02995847340320348, "step": 303, "train/total_loss": 0.18815520405769348 }, { "entropy": 9.248624801635742, "epoch": 0.030057346252719004, "mean_token_accuracy": 0.6955267190933228, "num_tokens": 1677214.0, "step": 304, "train/ce_loss": 0.5633192658424377 }, { "epoch": 0.030057346252719004, "step": 304, "train/sim_loss": 0.15625 }, { "epoch": 0.030057346252719004, "step": 304, "train/total_loss": 0.21258193254470825 }, { "entropy": 9.261011123657227, "epoch": 0.030156219102234528, "mean_token_accuracy": 0.7319098711013794, "num_tokens": 1682629.0, "step": 305, "train/ce_loss": 0.8646674752235413 }, { "epoch": 0.030156219102234528, "step": 305, "train/sim_loss": 0.171875 }, { "epoch": 0.030156219102234528, "step": 305, "train/total_loss": 0.2583417594432831 }, { "entropy": 9.378775596618652, "epoch": 0.030255091951750048, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 1687962.0, "step": 306, "train/ce_loss": 1.0545510053634644 }, { "epoch": 0.030255091951750048, "step": 306, "train/sim_loss": 0.14453125 }, { "epoch": 0.030255091951750048, "step": 306, "train/total_loss": 0.24998635053634644 }, { "entropy": 8.19654655456543, "epoch": 0.030353964801265572, "mean_token_accuracy": 0.7077385187149048, "num_tokens": 1693891.0, "step": 307, "train/ce_loss": 0.41341182589530945 }, { "epoch": 0.030353964801265572, "step": 307, "train/sim_loss": 0.06640625 }, { "epoch": 0.030353964801265572, "step": 307, "train/total_loss": 0.10774743556976318 }, { "entropy": 9.193519592285156, "epoch": 0.030452837650781096, "mean_token_accuracy": 0.7269503474235535, "num_tokens": 1699358.0, "step": 308, "train/ce_loss": 0.7965303063392639 }, { "epoch": 0.030452837650781096, "step": 308, "train/sim_loss": 0.12890625 }, { "epoch": 0.030452837650781096, "step": 308, "train/total_loss": 0.20855927467346191 }, { "entropy": 9.180891036987305, "epoch": 0.03055171050029662, "mean_token_accuracy": 0.7491207718849182, "num_tokens": 1704763.0, "step": 309, "train/ce_loss": 0.9473808407783508 }, { "epoch": 0.03055171050029662, "step": 309, "train/sim_loss": 0.125 }, { "epoch": 0.03055171050029662, "step": 309, "train/total_loss": 0.21973809599876404 }, { "entropy": 9.06826400756836, "epoch": 0.030650583349812143, "mean_token_accuracy": 0.7599591612815857, "num_tokens": 1710380.0, "step": 310, "train/ce_loss": 1.0120995044708252 }, { "epoch": 0.030650583349812143, "step": 310, "train/sim_loss": 0.1171875 }, { "epoch": 0.030650583349812143, "step": 310, "train/total_loss": 0.21839745342731476 }, { "entropy": 9.114416122436523, "epoch": 0.030749456199327663, "mean_token_accuracy": 0.7445175647735596, "num_tokens": 1715901.0, "step": 311, "train/ce_loss": 1.0061215162277222 }, { "epoch": 0.030749456199327663, "step": 311, "train/sim_loss": 0.14453125 }, { "epoch": 0.030749456199327663, "step": 311, "train/total_loss": 0.24514341354370117 }, { "entropy": 8.9130859375, "epoch": 0.030848329048843187, "mean_token_accuracy": 0.7529411911964417, "num_tokens": 1721554.0, "step": 312, "train/ce_loss": 0.7147780656814575 }, { "epoch": 0.030848329048843187, "step": 312, "train/sim_loss": 0.12109375 }, { "epoch": 0.030848329048843187, "step": 312, "train/total_loss": 0.19257155060768127 }, { "entropy": 9.052206993103027, "epoch": 0.03094720189835871, "mean_token_accuracy": 0.7632075548171997, "num_tokens": 1727179.0, "step": 313, "train/ce_loss": 0.6037774085998535 }, { "epoch": 0.03094720189835871, "step": 313, "train/sim_loss": 0.1171875 }, { "epoch": 0.03094720189835871, "step": 313, "train/total_loss": 0.17756524682044983 }, { "entropy": 9.285832405090332, "epoch": 0.031046074747874235, "mean_token_accuracy": 0.6831682920455933, "num_tokens": 1732575.0, "step": 314, "train/ce_loss": 0.926917552947998 }, { "epoch": 0.031046074747874235, "step": 314, "train/sim_loss": 0.14453125 }, { "epoch": 0.031046074747874235, "step": 314, "train/total_loss": 0.23722299933433533 }, { "entropy": 9.342964172363281, "epoch": 0.03114494759738976, "mean_token_accuracy": 0.7083854675292969, "num_tokens": 1738002.0, "step": 315, "train/ce_loss": 1.0450477600097656 }, { "epoch": 0.03114494759738976, "step": 315, "train/sim_loss": 0.08203125 }, { "epoch": 0.03114494759738976, "step": 315, "train/total_loss": 0.1865360289812088 }, { "entropy": 9.53966999053955, "epoch": 0.03124382044690528, "mean_token_accuracy": 0.7415565252304077, "num_tokens": 1743311.0, "step": 316, "train/ce_loss": 0.920505702495575 }, { "epoch": 0.03124382044690528, "step": 316, "train/sim_loss": 0.1328125 }, { "epoch": 0.03124382044690528, "step": 316, "train/total_loss": 0.22486308217048645 }, { "entropy": 9.336041450500488, "epoch": 0.0313426932964208, "mean_token_accuracy": 0.678618848323822, "num_tokens": 1748701.0, "step": 317, "train/ce_loss": 0.69814133644104 }, { "epoch": 0.0313426932964208, "step": 317, "train/sim_loss": 0.09765625 }, { "epoch": 0.0313426932964208, "step": 317, "train/total_loss": 0.16747039556503296 }, { "entropy": 9.401836395263672, "epoch": 0.03144156614593632, "mean_token_accuracy": 0.752439022064209, "num_tokens": 1754079.0, "step": 318, "train/ce_loss": 1.2298121452331543 }, { "epoch": 0.03144156614593632, "step": 318, "train/sim_loss": 0.12890625 }, { "epoch": 0.03144156614593632, "step": 318, "train/total_loss": 0.2518874704837799 }, { "entropy": 9.416969299316406, "epoch": 0.03154043899545185, "mean_token_accuracy": 0.7812061905860901, "num_tokens": 1759459.0, "step": 319, "train/ce_loss": 0.9361550211906433 }, { "epoch": 0.03154043899545185, "step": 319, "train/sim_loss": 0.18359375 }, { "epoch": 0.03154043899545185, "step": 319, "train/total_loss": 0.27720925211906433 }, { "epoch": 0.03163931184496737, "grad_norm": 1.258382797241211, "learning_rate": 9.92360183948969e-06, "loss": 0.2246, "step": 320 }, { "entropy": 8.909311294555664, "epoch": 0.03163931184496737, "mean_token_accuracy": 0.7323232293128967, "num_tokens": 1765083.0, "step": 320, "train/ce_loss": 0.8755263090133667 }, { "epoch": 0.03163931184496737, "step": 320, "train/sim_loss": 0.1171875 }, { "epoch": 0.03163931184496737, "step": 320, "train/total_loss": 0.20474013686180115 }, { "entropy": 9.362167358398438, "epoch": 0.0317381846944829, "mean_token_accuracy": 0.7516340017318726, "num_tokens": 1770468.0, "step": 321, "train/ce_loss": 0.5513641834259033 }, { "epoch": 0.0317381846944829, "step": 321, "train/sim_loss": 0.15625 }, { "epoch": 0.0317381846944829, "step": 321, "train/total_loss": 0.21138641238212585 }, { "entropy": 9.476106643676758, "epoch": 0.03183705754399842, "mean_token_accuracy": 0.7308743000030518, "num_tokens": 1775822.0, "step": 322, "train/ce_loss": 0.8285568952560425 }, { "epoch": 0.03183705754399842, "step": 322, "train/sim_loss": 0.0859375 }, { "epoch": 0.03183705754399842, "step": 322, "train/total_loss": 0.1687932014465332 }, { "entropy": 8.8974609375, "epoch": 0.03193593039351394, "mean_token_accuracy": 0.7006610035896301, "num_tokens": 1781502.0, "step": 323, "train/ce_loss": 0.9600048065185547 }, { "epoch": 0.03193593039351394, "step": 323, "train/sim_loss": 0.18359375 }, { "epoch": 0.03193593039351394, "step": 323, "train/total_loss": 0.2795942425727844 }, { "entropy": 8.829795837402344, "epoch": 0.032034803243029465, "mean_token_accuracy": 0.6739961504936218, "num_tokens": 1787194.0, "step": 324, "train/ce_loss": 1.4080110788345337 }, { "epoch": 0.032034803243029465, "step": 324, "train/sim_loss": 0.1953125 }, { "epoch": 0.032034803243029465, "step": 324, "train/total_loss": 0.3361136317253113 }, { "entropy": 9.36574649810791, "epoch": 0.032133676092544985, "mean_token_accuracy": 0.7472923994064331, "num_tokens": 1792526.0, "step": 325, "train/ce_loss": 0.6475684642791748 }, { "epoch": 0.032133676092544985, "step": 325, "train/sim_loss": 0.06640625 }, { "epoch": 0.032133676092544985, "step": 325, "train/total_loss": 0.131163090467453 }, { "entropy": 9.103139877319336, "epoch": 0.03223254894206051, "mean_token_accuracy": 0.6907756924629211, "num_tokens": 1798134.0, "step": 326, "train/ce_loss": 0.8041818737983704 }, { "epoch": 0.03223254894206051, "step": 326, "train/sim_loss": 0.11328125 }, { "epoch": 0.03223254894206051, "step": 326, "train/total_loss": 0.193699449300766 }, { "entropy": 9.089548110961914, "epoch": 0.03233142179157603, "mean_token_accuracy": 0.6750547289848328, "num_tokens": 1803676.0, "step": 327, "train/ce_loss": 1.1972815990447998 }, { "epoch": 0.03233142179157603, "step": 327, "train/sim_loss": 0.1484375 }, { "epoch": 0.03233142179157603, "step": 327, "train/total_loss": 0.268165647983551 }, { "entropy": 9.314996719360352, "epoch": 0.03243029464109155, "mean_token_accuracy": 0.6884875893592834, "num_tokens": 1809218.0, "step": 328, "train/ce_loss": 0.8321152329444885 }, { "epoch": 0.03243029464109155, "step": 328, "train/sim_loss": 0.1171875 }, { "epoch": 0.03243029464109155, "step": 328, "train/total_loss": 0.2003990262746811 }, { "entropy": 9.47107982635498, "epoch": 0.03252916749060708, "mean_token_accuracy": 0.7665847539901733, "num_tokens": 1814654.0, "step": 329, "train/ce_loss": 0.8651570677757263 }, { "epoch": 0.03252916749060708, "step": 329, "train/sim_loss": 0.09375 }, { "epoch": 0.03252916749060708, "step": 329, "train/total_loss": 0.18026570975780487 }, { "entropy": 9.399576187133789, "epoch": 0.0326280403401226, "mean_token_accuracy": 0.725568950176239, "num_tokens": 1820064.0, "step": 330, "train/ce_loss": 0.8274539113044739 }, { "epoch": 0.0326280403401226, "step": 330, "train/sim_loss": 0.15234375 }, { "epoch": 0.0326280403401226, "step": 330, "train/total_loss": 0.23508915305137634 }, { "entropy": 9.200662612915039, "epoch": 0.03272691318963813, "mean_token_accuracy": 0.675000011920929, "num_tokens": 1825537.0, "step": 331, "train/ce_loss": 0.8937278985977173 }, { "epoch": 0.03272691318963813, "step": 331, "train/sim_loss": 0.12109375 }, { "epoch": 0.03272691318963813, "step": 331, "train/total_loss": 0.21046653389930725 }, { "entropy": 9.259733200073242, "epoch": 0.03282578603915365, "mean_token_accuracy": 0.7313432693481445, "num_tokens": 1831037.0, "step": 332, "train/ce_loss": 0.9736483693122864 }, { "epoch": 0.03282578603915365, "step": 332, "train/sim_loss": 0.14453125 }, { "epoch": 0.03282578603915365, "step": 332, "train/total_loss": 0.24189609289169312 }, { "entropy": 9.412879943847656, "epoch": 0.03292465888866917, "mean_token_accuracy": 0.7555012106895447, "num_tokens": 1836443.0, "step": 333, "train/ce_loss": 0.5263305306434631 }, { "epoch": 0.03292465888866917, "step": 333, "train/sim_loss": 0.10546875 }, { "epoch": 0.03292465888866917, "step": 333, "train/total_loss": 0.15810179710388184 }, { "entropy": 9.206225395202637, "epoch": 0.033023531738184696, "mean_token_accuracy": 0.734649121761322, "num_tokens": 1841927.0, "step": 334, "train/ce_loss": 1.1871784925460815 }, { "epoch": 0.033023531738184696, "step": 334, "train/sim_loss": 0.11328125 }, { "epoch": 0.033023531738184696, "step": 334, "train/total_loss": 0.23199909925460815 }, { "entropy": 9.35094165802002, "epoch": 0.033122404587700216, "mean_token_accuracy": 0.773809552192688, "num_tokens": 1847427.0, "step": 335, "train/ce_loss": 1.0286810398101807 }, { "epoch": 0.033122404587700216, "step": 335, "train/sim_loss": 0.140625 }, { "epoch": 0.033122404587700216, "step": 335, "train/total_loss": 0.24349310994148254 }, { "entropy": 9.458587646484375, "epoch": 0.03322127743721574, "mean_token_accuracy": 0.7605633735656738, "num_tokens": 1852825.0, "step": 336, "train/ce_loss": 0.7521236538887024 }, { "epoch": 0.03322127743721574, "step": 336, "train/sim_loss": 0.1171875 }, { "epoch": 0.03322127743721574, "step": 336, "train/total_loss": 0.19239985942840576 }, { "entropy": 9.293302536010742, "epoch": 0.033320150286731264, "mean_token_accuracy": 0.7087053656578064, "num_tokens": 1858384.0, "step": 337, "train/ce_loss": 1.624673843383789 }, { "epoch": 0.033320150286731264, "step": 337, "train/sim_loss": 0.22265625 }, { "epoch": 0.033320150286731264, "step": 337, "train/total_loss": 0.3851236402988434 }, { "entropy": 9.299514770507812, "epoch": 0.033419023136246784, "mean_token_accuracy": 0.7670127153396606, "num_tokens": 1863836.0, "step": 338, "train/ce_loss": 0.4640211760997772 }, { "epoch": 0.033419023136246784, "step": 338, "train/sim_loss": 0.0546875 }, { "epoch": 0.033419023136246784, "step": 338, "train/total_loss": 0.10108961910009384 }, { "entropy": 9.195375442504883, "epoch": 0.03351789598576231, "mean_token_accuracy": 0.7524752616882324, "num_tokens": 1869376.0, "step": 339, "train/ce_loss": 1.6604721546173096 }, { "epoch": 0.03351789598576231, "step": 339, "train/sim_loss": 0.125 }, { "epoch": 0.03351789598576231, "step": 339, "train/total_loss": 0.29104721546173096 }, { "epoch": 0.03361676883527783, "grad_norm": 1.5418176651000977, "learning_rate": 9.918656974731741e-06, "loss": 0.2199, "step": 340 }, { "entropy": 9.513364791870117, "epoch": 0.03361676883527783, "mean_token_accuracy": 0.7271540760993958, "num_tokens": 1874772.0, "step": 340, "train/ce_loss": 1.0126936435699463 }, { "epoch": 0.03361676883527783, "step": 340, "train/sim_loss": 0.171875 }, { "epoch": 0.03361676883527783, "step": 340, "train/total_loss": 0.27314436435699463 }, { "entropy": 9.085233688354492, "epoch": 0.03371564168479336, "mean_token_accuracy": 0.764397919178009, "num_tokens": 1880328.0, "step": 341, "train/ce_loss": 1.032720685005188 }, { "epoch": 0.03371564168479336, "step": 341, "train/sim_loss": 0.15625 }, { "epoch": 0.03371564168479336, "step": 341, "train/total_loss": 0.25952208042144775 }, { "entropy": 9.572994232177734, "epoch": 0.03381451453430888, "mean_token_accuracy": 0.7737321257591248, "num_tokens": 1885832.0, "step": 342, "train/ce_loss": 0.6404688358306885 }, { "epoch": 0.03381451453430888, "step": 342, "train/sim_loss": 0.078125 }, { "epoch": 0.03381451453430888, "step": 342, "train/total_loss": 0.14217188954353333 }, { "entropy": 9.075947761535645, "epoch": 0.0339133873838244, "mean_token_accuracy": 0.7789598107337952, "num_tokens": 1891314.0, "step": 343, "train/ce_loss": 0.530364990234375 }, { "epoch": 0.0339133873838244, "step": 343, "train/sim_loss": 0.140625 }, { "epoch": 0.0339133873838244, "step": 343, "train/total_loss": 0.19366149604320526 }, { "entropy": 9.484125137329102, "epoch": 0.034012260233339926, "mean_token_accuracy": 0.7607361674308777, "num_tokens": 1896686.0, "step": 344, "train/ce_loss": 0.6032302379608154 }, { "epoch": 0.034012260233339926, "step": 344, "train/sim_loss": 0.0625 }, { "epoch": 0.034012260233339926, "step": 344, "train/total_loss": 0.12282302975654602 }, { "entropy": 9.243063926696777, "epoch": 0.03411113308285545, "mean_token_accuracy": 0.8015435338020325, "num_tokens": 1902225.0, "step": 345, "train/ce_loss": 0.6574256420135498 }, { "epoch": 0.03411113308285545, "step": 345, "train/sim_loss": 0.0703125 }, { "epoch": 0.03411113308285545, "step": 345, "train/total_loss": 0.13605506718158722 }, { "entropy": 9.604881286621094, "epoch": 0.034210005932370974, "mean_token_accuracy": 0.7260677218437195, "num_tokens": 1907536.0, "step": 346, "train/ce_loss": 0.7008615136146545 }, { "epoch": 0.034210005932370974, "step": 346, "train/sim_loss": 0.12109375 }, { "epoch": 0.034210005932370974, "step": 346, "train/total_loss": 0.19117990136146545 }, { "entropy": 9.267822265625, "epoch": 0.034308878781886494, "mean_token_accuracy": 0.7093712687492371, "num_tokens": 1912956.0, "step": 347, "train/ce_loss": 0.8986371755599976 }, { "epoch": 0.034308878781886494, "step": 347, "train/sim_loss": 0.12890625 }, { "epoch": 0.034308878781886494, "step": 347, "train/total_loss": 0.21876996755599976 }, { "entropy": 9.50816822052002, "epoch": 0.034407751631402014, "mean_token_accuracy": 0.7076326012611389, "num_tokens": 1918531.0, "step": 348, "train/ce_loss": 1.119275689125061 }, { "epoch": 0.034407751631402014, "step": 348, "train/sim_loss": 0.1171875 }, { "epoch": 0.034407751631402014, "step": 348, "train/total_loss": 0.2291150689125061 }, { "entropy": 9.30505084991455, "epoch": 0.03450662448091754, "mean_token_accuracy": 0.7331887483596802, "num_tokens": 1923994.0, "step": 349, "train/ce_loss": 0.519109308719635 }, { "epoch": 0.03450662448091754, "step": 349, "train/sim_loss": 0.05078125 }, { "epoch": 0.03450662448091754, "step": 349, "train/total_loss": 0.10269218683242798 }, { "entropy": 9.224017143249512, "epoch": 0.03460549733043306, "mean_token_accuracy": 0.7531718611717224, "num_tokens": 1929485.0, "step": 350, "train/ce_loss": 0.6781836748123169 }, { "epoch": 0.03460549733043306, "step": 350, "train/sim_loss": 0.06640625 }, { "epoch": 0.03460549733043306, "step": 350, "train/total_loss": 0.13422462344169617 }, { "entropy": 9.472253799438477, "epoch": 0.03470437017994859, "mean_token_accuracy": 0.7405140995979309, "num_tokens": 1934831.0, "step": 351, "train/ce_loss": 0.5664555430412292 }, { "epoch": 0.03470437017994859, "step": 351, "train/sim_loss": 0.04296875 }, { "epoch": 0.03470437017994859, "step": 351, "train/total_loss": 0.09961430728435516 }, { "entropy": 9.080623626708984, "epoch": 0.03480324302946411, "mean_token_accuracy": 0.6801579594612122, "num_tokens": 1940415.0, "step": 352, "train/ce_loss": 2.0751354694366455 }, { "epoch": 0.03480324302946411, "step": 352, "train/sim_loss": 0.125 }, { "epoch": 0.03480324302946411, "step": 352, "train/total_loss": 0.33251357078552246 }, { "entropy": 9.21993637084961, "epoch": 0.03490211587897963, "mean_token_accuracy": 0.7171398401260376, "num_tokens": 1945914.0, "step": 353, "train/ce_loss": 0.5155178308486938 }, { "epoch": 0.03490211587897963, "step": 353, "train/sim_loss": 0.08984375 }, { "epoch": 0.03490211587897963, "step": 353, "train/total_loss": 0.14139553904533386 }, { "entropy": 9.187665939331055, "epoch": 0.03500098872849516, "mean_token_accuracy": 0.7733050584793091, "num_tokens": 1951402.0, "step": 354, "train/ce_loss": 0.768223226070404 }, { "epoch": 0.03500098872849516, "step": 354, "train/sim_loss": 0.125 }, { "epoch": 0.03500098872849516, "step": 354, "train/total_loss": 0.20182232558727264 }, { "entropy": 9.198156356811523, "epoch": 0.03509986157801068, "mean_token_accuracy": 0.778372585773468, "num_tokens": 1956970.0, "step": 355, "train/ce_loss": 0.8561949133872986 }, { "epoch": 0.03509986157801068, "step": 355, "train/sim_loss": 0.0859375 }, { "epoch": 0.03509986157801068, "step": 355, "train/total_loss": 0.1715569943189621 }, { "entropy": 9.409862518310547, "epoch": 0.035198734427526204, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 1962442.0, "step": 356, "train/ce_loss": 0.7296246290206909 }, { "epoch": 0.035198734427526204, "step": 356, "train/sim_loss": 0.13671875 }, { "epoch": 0.035198734427526204, "step": 356, "train/total_loss": 0.2096812129020691 }, { "entropy": 9.488100051879883, "epoch": 0.035297607277041725, "mean_token_accuracy": 0.7160647511482239, "num_tokens": 1967849.0, "step": 357, "train/ce_loss": 0.666261613368988 }, { "epoch": 0.035297607277041725, "step": 357, "train/sim_loss": 0.203125 }, { "epoch": 0.035297607277041725, "step": 357, "train/total_loss": 0.2697511613368988 }, { "entropy": 8.638545036315918, "epoch": 0.035396480126557245, "mean_token_accuracy": 0.7121848464012146, "num_tokens": 1973813.0, "step": 358, "train/ce_loss": 0.3530464470386505 }, { "epoch": 0.035396480126557245, "step": 358, "train/sim_loss": 0.1015625 }, { "epoch": 0.035396480126557245, "step": 358, "train/total_loss": 0.13686715066432953 }, { "entropy": 9.04550552368164, "epoch": 0.03549535297607277, "mean_token_accuracy": 0.6963667869567871, "num_tokens": 1979558.0, "step": 359, "train/ce_loss": 1.8127251863479614 }, { "epoch": 0.03549535297607277, "step": 359, "train/sim_loss": 0.08984375 }, { "epoch": 0.03549535297607277, "step": 359, "train/total_loss": 0.2711162567138672 }, { "epoch": 0.03559422582558829, "grad_norm": 1.3116391897201538, "learning_rate": 9.913712109973793e-06, "loss": 0.2071, "step": 360 }, { "entropy": 9.215813636779785, "epoch": 0.03559422582558829, "mean_token_accuracy": 0.7419725060462952, "num_tokens": 1985115.0, "step": 360, "train/ce_loss": 0.8974589705467224 }, { "epoch": 0.03559422582558829, "step": 360, "train/sim_loss": 0.1875 }, { "epoch": 0.03559422582558829, "step": 360, "train/total_loss": 0.2772459089756012 }, { "entropy": 9.295516014099121, "epoch": 0.03569309867510382, "mean_token_accuracy": 0.7881448864936829, "num_tokens": 1990614.0, "step": 361, "train/ce_loss": 0.6553469300270081 }, { "epoch": 0.03569309867510382, "step": 361, "train/sim_loss": 0.12109375 }, { "epoch": 0.03569309867510382, "step": 361, "train/total_loss": 0.18662844598293304 }, { "entropy": 9.541372299194336, "epoch": 0.03579197152461934, "mean_token_accuracy": 0.718482255935669, "num_tokens": 1996071.0, "step": 362, "train/ce_loss": 0.8680171966552734 }, { "epoch": 0.03579197152461934, "step": 362, "train/sim_loss": 0.1015625 }, { "epoch": 0.03579197152461934, "step": 362, "train/total_loss": 0.18836422264575958 }, { "entropy": 9.153349876403809, "epoch": 0.03589084437413486, "mean_token_accuracy": 0.7692307829856873, "num_tokens": 2001726.0, "step": 363, "train/ce_loss": 0.9088106155395508 }, { "epoch": 0.03589084437413486, "step": 363, "train/sim_loss": 0.0703125 }, { "epoch": 0.03589084437413486, "step": 363, "train/total_loss": 0.16119356453418732 }, { "entropy": 9.049999237060547, "epoch": 0.03598971722365039, "mean_token_accuracy": 0.7309941649436951, "num_tokens": 2007257.0, "step": 364, "train/ce_loss": 0.5734365582466125 }, { "epoch": 0.03598971722365039, "step": 364, "train/sim_loss": 0.05078125 }, { "epoch": 0.03598971722365039, "step": 364, "train/total_loss": 0.10812491178512573 }, { "entropy": 9.129220008850098, "epoch": 0.03608859007316591, "mean_token_accuracy": 0.7347875833511353, "num_tokens": 2012780.0, "step": 365, "train/ce_loss": 0.8205778002738953 }, { "epoch": 0.03608859007316591, "step": 365, "train/sim_loss": 0.1015625 }, { "epoch": 0.03608859007316591, "step": 365, "train/total_loss": 0.18362027406692505 }, { "entropy": 9.69770622253418, "epoch": 0.036187462922681435, "mean_token_accuracy": 0.7116991877555847, "num_tokens": 2018280.0, "step": 366, "train/ce_loss": 1.162908911705017 }, { "epoch": 0.036187462922681435, "step": 366, "train/sim_loss": 0.12109375 }, { "epoch": 0.036187462922681435, "step": 366, "train/total_loss": 0.2373846471309662 }, { "entropy": 8.758979797363281, "epoch": 0.036286335772196955, "mean_token_accuracy": 0.7175463438034058, "num_tokens": 2024274.0, "step": 367, "train/ce_loss": 0.512734591960907 }, { "epoch": 0.036286335772196955, "step": 367, "train/sim_loss": 0.125 }, { "epoch": 0.036286335772196955, "step": 367, "train/total_loss": 0.17627346515655518 }, { "entropy": 9.459256172180176, "epoch": 0.036385208621712475, "mean_token_accuracy": 0.7685534358024597, "num_tokens": 2029730.0, "step": 368, "train/ce_loss": 1.01776921749115 }, { "epoch": 0.036385208621712475, "step": 368, "train/sim_loss": 0.1171875 }, { "epoch": 0.036385208621712475, "step": 368, "train/total_loss": 0.21896442770957947 }, { "entropy": 9.393678665161133, "epoch": 0.036484081471228, "mean_token_accuracy": 0.6691358089447021, "num_tokens": 2035134.0, "step": 369, "train/ce_loss": 2.1969802379608154 }, { "epoch": 0.036484081471228, "step": 369, "train/sim_loss": 0.19140625 }, { "epoch": 0.036484081471228, "step": 369, "train/total_loss": 0.4111042618751526 }, { "entropy": 9.222057342529297, "epoch": 0.03658295432074352, "mean_token_accuracy": 0.732342004776001, "num_tokens": 2040499.0, "step": 370, "train/ce_loss": 0.867620587348938 }, { "epoch": 0.03658295432074352, "step": 370, "train/sim_loss": 0.08984375 }, { "epoch": 0.03658295432074352, "step": 370, "train/total_loss": 0.17660582065582275 }, { "entropy": 9.177568435668945, "epoch": 0.03668182717025905, "mean_token_accuracy": 0.7789255976676941, "num_tokens": 2046062.0, "step": 371, "train/ce_loss": 0.574241578578949 }, { "epoch": 0.03668182717025905, "step": 371, "train/sim_loss": 0.171875 }, { "epoch": 0.03668182717025905, "step": 371, "train/total_loss": 0.2292991578578949 }, { "entropy": 9.401215553283691, "epoch": 0.03678070001977457, "mean_token_accuracy": 0.7241379022598267, "num_tokens": 2051536.0, "step": 372, "train/ce_loss": 1.01215660572052 }, { "epoch": 0.03678070001977457, "step": 372, "train/sim_loss": 0.08203125 }, { "epoch": 0.03678070001977457, "step": 372, "train/total_loss": 0.183246910572052 }, { "entropy": 8.883334159851074, "epoch": 0.03687957286929009, "mean_token_accuracy": 0.791556715965271, "num_tokens": 2057309.0, "step": 373, "train/ce_loss": 0.6467971205711365 }, { "epoch": 0.03687957286929009, "step": 373, "train/sim_loss": 0.125 }, { "epoch": 0.03687957286929009, "step": 373, "train/total_loss": 0.18967971205711365 }, { "entropy": 9.193597793579102, "epoch": 0.03697844571880562, "mean_token_accuracy": 0.7833537459373474, "num_tokens": 2062734.0, "step": 374, "train/ce_loss": 0.5633342266082764 }, { "epoch": 0.03697844571880562, "step": 374, "train/sim_loss": 0.16796875 }, { "epoch": 0.03697844571880562, "step": 374, "train/total_loss": 0.22430217266082764 }, { "entropy": 9.372347831726074, "epoch": 0.03707731856832114, "mean_token_accuracy": 0.7141148447990417, "num_tokens": 2068206.0, "step": 375, "train/ce_loss": 0.6887407898902893 }, { "epoch": 0.03707731856832114, "step": 375, "train/sim_loss": 0.15234375 }, { "epoch": 0.03707731856832114, "step": 375, "train/total_loss": 0.22121784090995789 }, { "entropy": 9.20608901977539, "epoch": 0.037176191417836665, "mean_token_accuracy": 0.7210526466369629, "num_tokens": 2073795.0, "step": 376, "train/ce_loss": 1.4395287036895752 }, { "epoch": 0.037176191417836665, "step": 376, "train/sim_loss": 0.203125 }, { "epoch": 0.037176191417836665, "step": 376, "train/total_loss": 0.347077876329422 }, { "entropy": 9.420259475708008, "epoch": 0.037275064267352186, "mean_token_accuracy": 0.7799510955810547, "num_tokens": 2079259.0, "step": 377, "train/ce_loss": 0.5549067258834839 }, { "epoch": 0.037275064267352186, "step": 377, "train/sim_loss": 0.08203125 }, { "epoch": 0.037275064267352186, "step": 377, "train/total_loss": 0.1375219225883484 }, { "entropy": 9.335251808166504, "epoch": 0.037373937116867706, "mean_token_accuracy": 0.7623220086097717, "num_tokens": 2084663.0, "step": 378, "train/ce_loss": 0.9098787307739258 }, { "epoch": 0.037373937116867706, "step": 378, "train/sim_loss": 0.0703125 }, { "epoch": 0.037373937116867706, "step": 378, "train/total_loss": 0.16130037605762482 }, { "entropy": 9.219718933105469, "epoch": 0.03747280996638323, "mean_token_accuracy": 0.8091428279876709, "num_tokens": 2090150.0, "step": 379, "train/ce_loss": 1.157272219657898 }, { "epoch": 0.03747280996638323, "step": 379, "train/sim_loss": 0.16015625 }, { "epoch": 0.03747280996638323, "step": 379, "train/total_loss": 0.2758834660053253 }, { "epoch": 0.03757168281589875, "grad_norm": 1.2466548681259155, "learning_rate": 9.908767245215844e-06, "loss": 0.1986, "step": 380 }, { "entropy": 9.398508071899414, "epoch": 0.03757168281589875, "mean_token_accuracy": 0.7402439117431641, "num_tokens": 2095536.0, "step": 380, "train/ce_loss": 1.029214859008789 }, { "epoch": 0.03757168281589875, "step": 380, "train/sim_loss": 0.12890625 }, { "epoch": 0.03757168281589875, "step": 380, "train/total_loss": 0.2318277359008789 }, { "entropy": 8.858792304992676, "epoch": 0.03767055566541428, "mean_token_accuracy": 0.7365728616714478, "num_tokens": 2101305.0, "step": 381, "train/ce_loss": 1.796033501625061 }, { "epoch": 0.03767055566541428, "step": 381, "train/sim_loss": 0.08203125 }, { "epoch": 0.03767055566541428, "step": 381, "train/total_loss": 0.26163458824157715 }, { "entropy": 9.060490608215332, "epoch": 0.0377694285149298, "mean_token_accuracy": 0.7103064060211182, "num_tokens": 2106985.0, "step": 382, "train/ce_loss": 1.1970797777175903 }, { "epoch": 0.0377694285149298, "step": 382, "train/sim_loss": 0.1171875 }, { "epoch": 0.0377694285149298, "step": 382, "train/total_loss": 0.23689547181129456 }, { "entropy": 8.974237442016602, "epoch": 0.03786830136444532, "mean_token_accuracy": 0.7482078671455383, "num_tokens": 2112636.0, "step": 383, "train/ce_loss": 0.7933669090270996 }, { "epoch": 0.03786830136444532, "step": 383, "train/sim_loss": 0.1015625 }, { "epoch": 0.03786830136444532, "step": 383, "train/total_loss": 0.18089920282363892 }, { "entropy": 9.512754440307617, "epoch": 0.03796717421396085, "mean_token_accuracy": 0.7166246771812439, "num_tokens": 2117982.0, "step": 384, "train/ce_loss": 1.0452443361282349 }, { "epoch": 0.03796717421396085, "step": 384, "train/sim_loss": 0.1328125 }, { "epoch": 0.03796717421396085, "step": 384, "train/total_loss": 0.2373369336128235 }, { "entropy": 8.73411750793457, "epoch": 0.03806604706347637, "mean_token_accuracy": 0.7399486899375916, "num_tokens": 2123788.0, "step": 385, "train/ce_loss": 0.6257913708686829 }, { "epoch": 0.03806604706347637, "step": 385, "train/sim_loss": 0.11328125 }, { "epoch": 0.03806604706347637, "step": 385, "train/total_loss": 0.17586039006710052 }, { "entropy": 9.387861251831055, "epoch": 0.03816491991299189, "mean_token_accuracy": 0.7476076483726501, "num_tokens": 2129229.0, "step": 386, "train/ce_loss": 0.47010332345962524 }, { "epoch": 0.03816491991299189, "step": 386, "train/sim_loss": 0.0859375 }, { "epoch": 0.03816491991299189, "step": 386, "train/total_loss": 0.13294783234596252 }, { "entropy": 9.584901809692383, "epoch": 0.038263792762507416, "mean_token_accuracy": 0.7490445971488953, "num_tokens": 2134573.0, "step": 387, "train/ce_loss": 0.8507129549980164 }, { "epoch": 0.038263792762507416, "step": 387, "train/sim_loss": 0.09375 }, { "epoch": 0.038263792762507416, "step": 387, "train/total_loss": 0.17882129549980164 }, { "entropy": 9.32851505279541, "epoch": 0.038362665612022936, "mean_token_accuracy": 0.7480226159095764, "num_tokens": 2140111.0, "step": 388, "train/ce_loss": 0.9332478642463684 }, { "epoch": 0.038362665612022936, "step": 388, "train/sim_loss": 0.07421875 }, { "epoch": 0.038362665612022936, "step": 388, "train/total_loss": 0.16754353046417236 }, { "entropy": 9.09821891784668, "epoch": 0.038461538461538464, "mean_token_accuracy": 0.7668886780738831, "num_tokens": 2145767.0, "step": 389, "train/ce_loss": 1.1549656391143799 }, { "epoch": 0.038461538461538464, "step": 389, "train/sim_loss": 0.171875 }, { "epoch": 0.038461538461538464, "step": 389, "train/total_loss": 0.28737157583236694 }, { "entropy": 9.204479217529297, "epoch": 0.038560411311053984, "mean_token_accuracy": 0.7132419943809509, "num_tokens": 2151448.0, "step": 390, "train/ce_loss": 0.3778284192085266 }, { "epoch": 0.038560411311053984, "step": 390, "train/sim_loss": 0.12890625 }, { "epoch": 0.038560411311053984, "step": 390, "train/total_loss": 0.16668909788131714 }, { "entropy": 9.56042766571045, "epoch": 0.038659284160569504, "mean_token_accuracy": 0.7457627058029175, "num_tokens": 2156807.0, "step": 391, "train/ce_loss": 0.8824957013130188 }, { "epoch": 0.038659284160569504, "step": 391, "train/sim_loss": 0.09765625 }, { "epoch": 0.038659284160569504, "step": 391, "train/total_loss": 0.1859058141708374 }, { "entropy": 8.92952823638916, "epoch": 0.03875815701008503, "mean_token_accuracy": 0.7222222089767456, "num_tokens": 2162604.0, "step": 392, "train/ce_loss": 1.328771710395813 }, { "epoch": 0.03875815701008503, "step": 392, "train/sim_loss": 0.1015625 }, { "epoch": 0.03875815701008503, "step": 392, "train/total_loss": 0.2344396710395813 }, { "entropy": 9.122282981872559, "epoch": 0.03885702985960055, "mean_token_accuracy": 0.7172011733055115, "num_tokens": 2168248.0, "step": 393, "train/ce_loss": 0.6065725088119507 }, { "epoch": 0.03885702985960055, "step": 393, "train/sim_loss": 0.04296875 }, { "epoch": 0.03885702985960055, "step": 393, "train/total_loss": 0.10362599790096283 }, { "entropy": 9.444314002990723, "epoch": 0.03895590270911608, "mean_token_accuracy": 0.7429577708244324, "num_tokens": 2173655.0, "step": 394, "train/ce_loss": 1.7077223062515259 }, { "epoch": 0.03895590270911608, "step": 394, "train/sim_loss": 0.1484375 }, { "epoch": 0.03895590270911608, "step": 394, "train/total_loss": 0.3192097544670105 }, { "entropy": 9.390140533447266, "epoch": 0.0390547755586316, "mean_token_accuracy": 0.7786697149276733, "num_tokens": 2179063.0, "step": 395, "train/ce_loss": 0.5688561797142029 }, { "epoch": 0.0390547755586316, "step": 395, "train/sim_loss": 0.171875 }, { "epoch": 0.0390547755586316, "step": 395, "train/total_loss": 0.22876061499118805 }, { "entropy": 9.469959259033203, "epoch": 0.03915364840814712, "mean_token_accuracy": 0.7852112650871277, "num_tokens": 2184517.0, "step": 396, "train/ce_loss": 1.0313953161239624 }, { "epoch": 0.03915364840814712, "step": 396, "train/sim_loss": 0.140625 }, { "epoch": 0.03915364840814712, "step": 396, "train/total_loss": 0.24376453459262848 }, { "entropy": 9.264622688293457, "epoch": 0.03925252125766265, "mean_token_accuracy": 0.6912087798118591, "num_tokens": 2190022.0, "step": 397, "train/ce_loss": 1.024153709411621 }, { "epoch": 0.03925252125766265, "step": 397, "train/sim_loss": 0.17578125 }, { "epoch": 0.03925252125766265, "step": 397, "train/total_loss": 0.27819663286209106 }, { "entropy": 9.20025634765625, "epoch": 0.03935139410717817, "mean_token_accuracy": 0.6918172240257263, "num_tokens": 2195586.0, "step": 398, "train/ce_loss": 1.3428899049758911 }, { "epoch": 0.03935139410717817, "step": 398, "train/sim_loss": 0.10546875 }, { "epoch": 0.03935139410717817, "step": 398, "train/total_loss": 0.2397577464580536 }, { "entropy": 9.385490417480469, "epoch": 0.039450266956693694, "mean_token_accuracy": 0.7073732614517212, "num_tokens": 2201048.0, "step": 399, "train/ce_loss": 1.0356072187423706 }, { "epoch": 0.039450266956693694, "step": 399, "train/sim_loss": 0.15625 }, { "epoch": 0.039450266956693694, "step": 399, "train/total_loss": 0.2598107159137726 }, { "epoch": 0.039549139806209214, "grad_norm": 1.44027578830719, "learning_rate": 9.903822380457894e-06, "loss": 0.2067, "step": 400 }, { "entropy": 8.854276657104492, "epoch": 0.039549139806209214, "mean_token_accuracy": 0.7048558592796326, "num_tokens": 2206852.0, "step": 400, "train/ce_loss": 1.3599828481674194 }, { "epoch": 0.039549139806209214, "step": 400, "train/sim_loss": 0.09375 }, { "epoch": 0.039549139806209214, "step": 400, "train/total_loss": 0.22974829375743866 }, { "entropy": 9.500508308410645, "epoch": 0.039648012655724735, "mean_token_accuracy": 0.7023809552192688, "num_tokens": 2212301.0, "step": 401, "train/ce_loss": 1.3288360834121704 }, { "epoch": 0.039648012655724735, "step": 401, "train/sim_loss": 0.18359375 }, { "epoch": 0.039648012655724735, "step": 401, "train/total_loss": 0.31647735834121704 }, { "entropy": 9.283777236938477, "epoch": 0.03974688550524026, "mean_token_accuracy": 0.7420381903648376, "num_tokens": 2217835.0, "step": 402, "train/ce_loss": 0.498300164937973 }, { "epoch": 0.03974688550524026, "step": 402, "train/sim_loss": 0.1328125 }, { "epoch": 0.03974688550524026, "step": 402, "train/total_loss": 0.18264251947402954 }, { "entropy": 9.570509910583496, "epoch": 0.03984575835475578, "mean_token_accuracy": 0.7653061151504517, "num_tokens": 2223365.0, "step": 403, "train/ce_loss": 0.5797458291053772 }, { "epoch": 0.03984575835475578, "step": 403, "train/sim_loss": 0.1015625 }, { "epoch": 0.03984575835475578, "step": 403, "train/total_loss": 0.15953707695007324 }, { "entropy": 9.78123664855957, "epoch": 0.03994463120427131, "mean_token_accuracy": 0.7438016533851624, "num_tokens": 2228705.0, "step": 404, "train/ce_loss": 0.7913654446601868 }, { "epoch": 0.03994463120427131, "step": 404, "train/sim_loss": 0.11328125 }, { "epoch": 0.03994463120427131, "step": 404, "train/total_loss": 0.19241780042648315 }, { "entropy": 9.399012565612793, "epoch": 0.04004350405378683, "mean_token_accuracy": 0.7081544995307922, "num_tokens": 2234190.0, "step": 405, "train/ce_loss": 0.6734864711761475 }, { "epoch": 0.04004350405378683, "step": 405, "train/sim_loss": 0.078125 }, { "epoch": 0.04004350405378683, "step": 405, "train/total_loss": 0.1454736590385437 }, { "entropy": 8.969768524169922, "epoch": 0.04014237690330235, "mean_token_accuracy": 0.7581818103790283, "num_tokens": 2239948.0, "step": 406, "train/ce_loss": 0.5460231304168701 }, { "epoch": 0.04014237690330235, "step": 406, "train/sim_loss": 0.15234375 }, { "epoch": 0.04014237690330235, "step": 406, "train/total_loss": 0.20694606006145477 }, { "entropy": 9.118574142456055, "epoch": 0.04024124975281788, "mean_token_accuracy": 0.8057143092155457, "num_tokens": 2245505.0, "step": 407, "train/ce_loss": 0.4696980118751526 }, { "epoch": 0.04024124975281788, "step": 407, "train/sim_loss": 0.05859375 }, { "epoch": 0.04024124975281788, "step": 407, "train/total_loss": 0.10556355118751526 }, { "entropy": 9.484020233154297, "epoch": 0.0403401226023334, "mean_token_accuracy": 0.7580437660217285, "num_tokens": 2250983.0, "step": 408, "train/ce_loss": 1.0065193176269531 }, { "epoch": 0.0403401226023334, "step": 408, "train/sim_loss": 0.078125 }, { "epoch": 0.0403401226023334, "step": 408, "train/total_loss": 0.17877693474292755 }, { "entropy": 9.476762771606445, "epoch": 0.040438995451848925, "mean_token_accuracy": 0.7148058414459229, "num_tokens": 2256414.0, "step": 409, "train/ce_loss": 0.5776805877685547 }, { "epoch": 0.040438995451848925, "step": 409, "train/sim_loss": 0.171875 }, { "epoch": 0.040438995451848925, "step": 409, "train/total_loss": 0.2296430617570877 }, { "entropy": 9.406131744384766, "epoch": 0.040537868301364445, "mean_token_accuracy": 0.7703962922096252, "num_tokens": 2261873.0, "step": 410, "train/ce_loss": 0.6690560579299927 }, { "epoch": 0.040537868301364445, "step": 410, "train/sim_loss": 0.16796875 }, { "epoch": 0.040537868301364445, "step": 410, "train/total_loss": 0.23487436771392822 }, { "entropy": 9.187175750732422, "epoch": 0.040636741150879965, "mean_token_accuracy": 0.7309697866439819, "num_tokens": 2267472.0, "step": 411, "train/ce_loss": 0.7690040469169617 }, { "epoch": 0.040636741150879965, "step": 411, "train/sim_loss": 0.12109375 }, { "epoch": 0.040636741150879965, "step": 411, "train/total_loss": 0.1979941576719284 }, { "entropy": 9.245943069458008, "epoch": 0.04073561400039549, "mean_token_accuracy": 0.7621483206748962, "num_tokens": 2272841.0, "step": 412, "train/ce_loss": 0.9506880640983582 }, { "epoch": 0.04073561400039549, "step": 412, "train/sim_loss": 0.13671875 }, { "epoch": 0.04073561400039549, "step": 412, "train/total_loss": 0.2317875623703003 }, { "entropy": 9.440951347351074, "epoch": 0.04083448684991101, "mean_token_accuracy": 0.7714285850524902, "num_tokens": 2278345.0, "step": 413, "train/ce_loss": 0.9201148152351379 }, { "epoch": 0.04083448684991101, "step": 413, "train/sim_loss": 0.125 }, { "epoch": 0.04083448684991101, "step": 413, "train/total_loss": 0.2170114815235138 }, { "entropy": 9.274852752685547, "epoch": 0.04093335969942654, "mean_token_accuracy": 0.7066666483879089, "num_tokens": 2283991.0, "step": 414, "train/ce_loss": 1.041424036026001 }, { "epoch": 0.04093335969942654, "step": 414, "train/sim_loss": 0.09375 }, { "epoch": 0.04093335969942654, "step": 414, "train/total_loss": 0.19789239764213562 }, { "entropy": 9.659711837768555, "epoch": 0.04103223254894206, "mean_token_accuracy": 0.7620320916175842, "num_tokens": 2289292.0, "step": 415, "train/ce_loss": 0.5996301770210266 }, { "epoch": 0.04103223254894206, "step": 415, "train/sim_loss": 0.10546875 }, { "epoch": 0.04103223254894206, "step": 415, "train/total_loss": 0.16543176770210266 }, { "entropy": 9.287007331848145, "epoch": 0.04113110539845758, "mean_token_accuracy": 0.6921465992927551, "num_tokens": 2294847.0, "step": 416, "train/ce_loss": 1.3126076459884644 }, { "epoch": 0.04113110539845758, "step": 416, "train/sim_loss": 0.11328125 }, { "epoch": 0.04113110539845758, "step": 416, "train/total_loss": 0.24454201757907867 }, { "entropy": 9.550548553466797, "epoch": 0.04122997824797311, "mean_token_accuracy": 0.750316858291626, "num_tokens": 2300213.0, "step": 417, "train/ce_loss": 0.7415802478790283 }, { "epoch": 0.04122997824797311, "step": 417, "train/sim_loss": 0.04296875 }, { "epoch": 0.04122997824797311, "step": 417, "train/total_loss": 0.11712677776813507 }, { "entropy": 9.15288257598877, "epoch": 0.04132885109748863, "mean_token_accuracy": 0.7952917218208313, "num_tokens": 2305813.0, "step": 418, "train/ce_loss": 0.6790967583656311 }, { "epoch": 0.04132885109748863, "step": 418, "train/sim_loss": 0.04296875 }, { "epoch": 0.04132885109748863, "step": 418, "train/total_loss": 0.11087843030691147 }, { "entropy": 8.780216217041016, "epoch": 0.041427723947004155, "mean_token_accuracy": 0.7415981888771057, "num_tokens": 2311741.0, "step": 419, "train/ce_loss": 0.7325243949890137 }, { "epoch": 0.041427723947004155, "step": 419, "train/sim_loss": 0.10546875 }, { "epoch": 0.041427723947004155, "step": 419, "train/total_loss": 0.17872118949890137 }, { "epoch": 0.041526596796519676, "grad_norm": 1.0192052125930786, "learning_rate": 9.898877515699947e-06, "loss": 0.1999, "step": 420 }, { "entropy": 9.499436378479004, "epoch": 0.041526596796519676, "mean_token_accuracy": 0.7042253613471985, "num_tokens": 2317167.0, "step": 420, "train/ce_loss": 1.0952421426773071 }, { "epoch": 0.041526596796519676, "step": 420, "train/sim_loss": 0.09375 }, { "epoch": 0.041526596796519676, "step": 420, "train/total_loss": 0.2032742202281952 }, { "entropy": 9.280860900878906, "epoch": 0.041625469646035196, "mean_token_accuracy": 0.7468926310539246, "num_tokens": 2322739.0, "step": 421, "train/ce_loss": 0.7982376217842102 }, { "epoch": 0.041625469646035196, "step": 421, "train/sim_loss": 0.0625 }, { "epoch": 0.041625469646035196, "step": 421, "train/total_loss": 0.14232376217842102 }, { "entropy": 9.420942306518555, "epoch": 0.04172434249555072, "mean_token_accuracy": 0.7351778745651245, "num_tokens": 2328188.0, "step": 422, "train/ce_loss": 0.6688096523284912 }, { "epoch": 0.04172434249555072, "step": 422, "train/sim_loss": 0.1328125 }, { "epoch": 0.04172434249555072, "step": 422, "train/total_loss": 0.1996934711933136 }, { "entropy": 9.48561954498291, "epoch": 0.04182321534506624, "mean_token_accuracy": 0.7132441997528076, "num_tokens": 2333662.0, "step": 423, "train/ce_loss": 0.9011238813400269 }, { "epoch": 0.04182321534506624, "step": 423, "train/sim_loss": 0.10546875 }, { "epoch": 0.04182321534506624, "step": 423, "train/total_loss": 0.19558113813400269 }, { "entropy": 9.305368423461914, "epoch": 0.04192208819458177, "mean_token_accuracy": 0.771584689617157, "num_tokens": 2339262.0, "step": 424, "train/ce_loss": 0.6388602256774902 }, { "epoch": 0.04192208819458177, "step": 424, "train/sim_loss": 0.0859375 }, { "epoch": 0.04192208819458177, "step": 424, "train/total_loss": 0.14982351660728455 }, { "entropy": 9.4307861328125, "epoch": 0.04202096104409729, "mean_token_accuracy": 0.7044711112976074, "num_tokens": 2344805.0, "step": 425, "train/ce_loss": 1.1260986328125 }, { "epoch": 0.04202096104409729, "step": 425, "train/sim_loss": 0.109375 }, { "epoch": 0.04202096104409729, "step": 425, "train/total_loss": 0.22198486328125 }, { "entropy": 9.727523803710938, "epoch": 0.04211983389361281, "mean_token_accuracy": 0.7357954382896423, "num_tokens": 2350053.0, "step": 426, "train/ce_loss": 0.6425698399543762 }, { "epoch": 0.04211983389361281, "step": 426, "train/sim_loss": 0.0546875 }, { "epoch": 0.04211983389361281, "step": 426, "train/total_loss": 0.11894448846578598 }, { "entropy": 9.625909805297852, "epoch": 0.04221870674312834, "mean_token_accuracy": 0.6917808055877686, "num_tokens": 2355376.0, "step": 427, "train/ce_loss": 1.00179922580719 }, { "epoch": 0.04221870674312834, "step": 427, "train/sim_loss": 0.12890625 }, { "epoch": 0.04221870674312834, "step": 427, "train/total_loss": 0.22908617556095123 }, { "entropy": 9.553486824035645, "epoch": 0.04231757959264386, "mean_token_accuracy": 0.713350772857666, "num_tokens": 2360707.0, "step": 428, "train/ce_loss": 1.0905821323394775 }, { "epoch": 0.04231757959264386, "step": 428, "train/sim_loss": 0.125 }, { "epoch": 0.04231757959264386, "step": 428, "train/total_loss": 0.23405821621418 }, { "entropy": 9.579269409179688, "epoch": 0.042416452442159386, "mean_token_accuracy": 0.7418032884597778, "num_tokens": 2366085.0, "step": 429, "train/ce_loss": 0.7459746599197388 }, { "epoch": 0.042416452442159386, "step": 429, "train/sim_loss": 0.171875 }, { "epoch": 0.042416452442159386, "step": 429, "train/total_loss": 0.24647247791290283 }, { "entropy": 9.128000259399414, "epoch": 0.042515325291674906, "mean_token_accuracy": 0.6571754217147827, "num_tokens": 2371640.0, "step": 430, "train/ce_loss": 0.9993560314178467 }, { "epoch": 0.042515325291674906, "step": 430, "train/sim_loss": 0.0859375 }, { "epoch": 0.042515325291674906, "step": 430, "train/total_loss": 0.1858731061220169 }, { "entropy": 9.477059364318848, "epoch": 0.042614198141190426, "mean_token_accuracy": 0.6870324015617371, "num_tokens": 2377082.0, "step": 431, "train/ce_loss": 0.7993332743644714 }, { "epoch": 0.042614198141190426, "step": 431, "train/sim_loss": 0.0859375 }, { "epoch": 0.042614198141190426, "step": 431, "train/total_loss": 0.16587083041667938 }, { "entropy": 9.534008979797363, "epoch": 0.042713070990705954, "mean_token_accuracy": 0.7609511613845825, "num_tokens": 2382438.0, "step": 432, "train/ce_loss": 0.6172921657562256 }, { "epoch": 0.042713070990705954, "step": 432, "train/sim_loss": 0.12109375 }, { "epoch": 0.042713070990705954, "step": 432, "train/total_loss": 0.18282297253608704 }, { "entropy": 9.3564453125, "epoch": 0.042811943840221474, "mean_token_accuracy": 0.740359902381897, "num_tokens": 2387887.0, "step": 433, "train/ce_loss": 1.0650404691696167 }, { "epoch": 0.042811943840221474, "step": 433, "train/sim_loss": 0.1640625 }, { "epoch": 0.042811943840221474, "step": 433, "train/total_loss": 0.27056655287742615 }, { "entropy": 9.420473098754883, "epoch": 0.042910816689737, "mean_token_accuracy": 0.7165841460227966, "num_tokens": 2393283.0, "step": 434, "train/ce_loss": 1.0626318454742432 }, { "epoch": 0.042910816689737, "step": 434, "train/sim_loss": 0.1328125 }, { "epoch": 0.042910816689737, "step": 434, "train/total_loss": 0.2390756905078888 }, { "entropy": 9.37211799621582, "epoch": 0.04300968953925252, "mean_token_accuracy": 0.7787182331085205, "num_tokens": 2398653.0, "step": 435, "train/ce_loss": 0.6769257187843323 }, { "epoch": 0.04300968953925252, "step": 435, "train/sim_loss": 0.15234375 }, { "epoch": 0.04300968953925252, "step": 435, "train/total_loss": 0.2200363278388977 }, { "entropy": 9.402790069580078, "epoch": 0.04310856238876804, "mean_token_accuracy": 0.7209039330482483, "num_tokens": 2404188.0, "step": 436, "train/ce_loss": 0.6056963205337524 }, { "epoch": 0.04310856238876804, "step": 436, "train/sim_loss": 0.109375 }, { "epoch": 0.04310856238876804, "step": 436, "train/total_loss": 0.169944629073143 }, { "entropy": 9.229952812194824, "epoch": 0.04320743523828357, "mean_token_accuracy": 0.7067307829856873, "num_tokens": 2409955.0, "step": 437, "train/ce_loss": 0.7020055651664734 }, { "epoch": 0.04320743523828357, "step": 437, "train/sim_loss": 0.05078125 }, { "epoch": 0.04320743523828357, "step": 437, "train/total_loss": 0.12098180502653122 }, { "entropy": 8.93527603149414, "epoch": 0.04330630808779909, "mean_token_accuracy": 0.7003179788589478, "num_tokens": 2415813.0, "step": 438, "train/ce_loss": 1.626790165901184 }, { "epoch": 0.04330630808779909, "step": 438, "train/sim_loss": 0.125 }, { "epoch": 0.04330630808779909, "step": 438, "train/total_loss": 0.2876790165901184 }, { "entropy": 9.465988159179688, "epoch": 0.043405180937314616, "mean_token_accuracy": 0.7111913561820984, "num_tokens": 2421242.0, "step": 439, "train/ce_loss": 1.030077576637268 }, { "epoch": 0.043405180937314616, "step": 439, "train/sim_loss": 0.0703125 }, { "epoch": 0.043405180937314616, "step": 439, "train/total_loss": 0.17332026362419128 }, { "epoch": 0.04350405378683014, "grad_norm": 1.6835906505584717, "learning_rate": 9.893932650941997e-06, "loss": 0.2138, "step": 440 }, { "entropy": 9.27851676940918, "epoch": 0.04350405378683014, "mean_token_accuracy": 0.7857142686843872, "num_tokens": 2426812.0, "step": 440, "train/ce_loss": 0.7828482985496521 }, { "epoch": 0.04350405378683014, "step": 440, "train/sim_loss": 0.15234375 }, { "epoch": 0.04350405378683014, "step": 440, "train/total_loss": 0.2306285798549652 }, { "entropy": 9.615768432617188, "epoch": 0.04360292663634566, "mean_token_accuracy": 0.7277289628982544, "num_tokens": 2432213.0, "step": 441, "train/ce_loss": 1.0390163660049438 }, { "epoch": 0.04360292663634566, "step": 441, "train/sim_loss": 0.07421875 }, { "epoch": 0.04360292663634566, "step": 441, "train/total_loss": 0.17812038958072662 }, { "entropy": 9.345560073852539, "epoch": 0.043701799485861184, "mean_token_accuracy": 0.7102803587913513, "num_tokens": 2437708.0, "step": 442, "train/ce_loss": 1.05363929271698 }, { "epoch": 0.043701799485861184, "step": 442, "train/sim_loss": 0.13671875 }, { "epoch": 0.043701799485861184, "step": 442, "train/total_loss": 0.24208268523216248 }, { "entropy": 9.633979797363281, "epoch": 0.043800672335376704, "mean_token_accuracy": 0.7335058450698853, "num_tokens": 2443054.0, "step": 443, "train/ce_loss": 1.0353796482086182 }, { "epoch": 0.043800672335376704, "step": 443, "train/sim_loss": 0.09765625 }, { "epoch": 0.043800672335376704, "step": 443, "train/total_loss": 0.20119422674179077 }, { "entropy": 9.463415145874023, "epoch": 0.04389954518489223, "mean_token_accuracy": 0.69597989320755, "num_tokens": 2448529.0, "step": 444, "train/ce_loss": 0.7755256295204163 }, { "epoch": 0.04389954518489223, "step": 444, "train/sim_loss": 0.08984375 }, { "epoch": 0.04389954518489223, "step": 444, "train/total_loss": 0.16739630699157715 }, { "entropy": 8.97813606262207, "epoch": 0.04399841803440775, "mean_token_accuracy": 0.7677642703056335, "num_tokens": 2454271.0, "step": 445, "train/ce_loss": 0.4948018491268158 }, { "epoch": 0.04399841803440775, "step": 445, "train/sim_loss": 0.125 }, { "epoch": 0.04399841803440775, "step": 445, "train/total_loss": 0.17448018491268158 }, { "entropy": 9.303359031677246, "epoch": 0.04409729088392327, "mean_token_accuracy": 0.7164339423179626, "num_tokens": 2459863.0, "step": 446, "train/ce_loss": 0.6891971826553345 }, { "epoch": 0.04409729088392327, "step": 446, "train/sim_loss": 0.08203125 }, { "epoch": 0.04409729088392327, "step": 446, "train/total_loss": 0.15095096826553345 }, { "entropy": 9.174467086791992, "epoch": 0.0441961637334388, "mean_token_accuracy": 0.769319474697113, "num_tokens": 2465333.0, "step": 447, "train/ce_loss": 1.001265287399292 }, { "epoch": 0.0441961637334388, "step": 447, "train/sim_loss": 0.140625 }, { "epoch": 0.0441961637334388, "step": 447, "train/total_loss": 0.24075153470039368 }, { "entropy": 9.19871711730957, "epoch": 0.04429503658295432, "mean_token_accuracy": 0.7174392938613892, "num_tokens": 2470798.0, "step": 448, "train/ce_loss": 1.195844054222107 }, { "epoch": 0.04429503658295432, "step": 448, "train/sim_loss": 0.078125 }, { "epoch": 0.04429503658295432, "step": 448, "train/total_loss": 0.19770941138267517 }, { "entropy": 9.338459014892578, "epoch": 0.04439390943246985, "mean_token_accuracy": 0.6993534564971924, "num_tokens": 2476363.0, "step": 449, "train/ce_loss": 1.2230912446975708 }, { "epoch": 0.04439390943246985, "step": 449, "train/sim_loss": 0.1015625 }, { "epoch": 0.04439390943246985, "step": 449, "train/total_loss": 0.2238716185092926 }, { "entropy": 9.331859588623047, "epoch": 0.04449278228198537, "mean_token_accuracy": 0.7367841601371765, "num_tokens": 2481899.0, "step": 450, "train/ce_loss": 0.8085346221923828 }, { "epoch": 0.04449278228198537, "step": 450, "train/sim_loss": 0.0859375 }, { "epoch": 0.04449278228198537, "step": 450, "train/total_loss": 0.16679096221923828 }, { "entropy": 9.644416809082031, "epoch": 0.04459165513150089, "mean_token_accuracy": 0.7357051968574524, "num_tokens": 2487289.0, "step": 451, "train/ce_loss": 0.832320511341095 }, { "epoch": 0.04459165513150089, "step": 451, "train/sim_loss": 0.1484375 }, { "epoch": 0.04459165513150089, "step": 451, "train/total_loss": 0.23166954517364502 }, { "entropy": 9.524726867675781, "epoch": 0.044690527981016415, "mean_token_accuracy": 0.7263888716697693, "num_tokens": 2492650.0, "step": 452, "train/ce_loss": 0.714298665523529 }, { "epoch": 0.044690527981016415, "step": 452, "train/sim_loss": 0.15625 }, { "epoch": 0.044690527981016415, "step": 452, "train/total_loss": 0.22767987847328186 }, { "entropy": 9.176843643188477, "epoch": 0.044789400830531935, "mean_token_accuracy": 0.712284505367279, "num_tokens": 2498220.0, "step": 453, "train/ce_loss": 1.291991949081421 }, { "epoch": 0.044789400830531935, "step": 453, "train/sim_loss": 0.17578125 }, { "epoch": 0.044789400830531935, "step": 453, "train/total_loss": 0.30498045682907104 }, { "entropy": 9.196844100952148, "epoch": 0.04488827368004746, "mean_token_accuracy": 0.681664764881134, "num_tokens": 2503765.0, "step": 454, "train/ce_loss": 0.7363664507865906 }, { "epoch": 0.04488827368004746, "step": 454, "train/sim_loss": 0.14453125 }, { "epoch": 0.04488827368004746, "step": 454, "train/total_loss": 0.21816790103912354 }, { "entropy": 9.522351264953613, "epoch": 0.04498714652956298, "mean_token_accuracy": 0.672530472278595, "num_tokens": 2509177.0, "step": 455, "train/ce_loss": 0.8966784477233887 }, { "epoch": 0.04498714652956298, "step": 455, "train/sim_loss": 0.1171875 }, { "epoch": 0.04498714652956298, "step": 455, "train/total_loss": 0.20685535669326782 }, { "entropy": 9.545125961303711, "epoch": 0.0450860193790785, "mean_token_accuracy": 0.720588207244873, "num_tokens": 2514569.0, "step": 456, "train/ce_loss": 0.9301498532295227 }, { "epoch": 0.0450860193790785, "step": 456, "train/sim_loss": 0.19921875 }, { "epoch": 0.0450860193790785, "step": 456, "train/total_loss": 0.29223373532295227 }, { "entropy": 9.482542037963867, "epoch": 0.04518489222859403, "mean_token_accuracy": 0.7421307563781738, "num_tokens": 2520035.0, "step": 457, "train/ce_loss": 0.6170859932899475 }, { "epoch": 0.04518489222859403, "step": 457, "train/sim_loss": 0.05859375 }, { "epoch": 0.04518489222859403, "step": 457, "train/total_loss": 0.12030234932899475 }, { "entropy": 9.498896598815918, "epoch": 0.04528376507810955, "mean_token_accuracy": 0.7888040542602539, "num_tokens": 2525448.0, "step": 458, "train/ce_loss": 0.6067715883255005 }, { "epoch": 0.04528376507810955, "step": 458, "train/sim_loss": 0.14453125 }, { "epoch": 0.04528376507810955, "step": 458, "train/total_loss": 0.2052084058523178 }, { "entropy": 9.45312213897705, "epoch": 0.04538263792762508, "mean_token_accuracy": 0.7570422291755676, "num_tokens": 2531136.0, "step": 459, "train/ce_loss": 0.8193569779396057 }, { "epoch": 0.04538263792762508, "step": 459, "train/sim_loss": 0.12890625 }, { "epoch": 0.04538263792762508, "step": 459, "train/total_loss": 0.21084195375442505 }, { "epoch": 0.0454815107771406, "grad_norm": 1.547608733177185, "learning_rate": 9.88898778618405e-06, "loss": 0.2122, "step": 460 }, { "entropy": 9.33843994140625, "epoch": 0.0454815107771406, "mean_token_accuracy": 0.7338902354240417, "num_tokens": 2536644.0, "step": 460, "train/ce_loss": 1.1203594207763672 }, { "epoch": 0.0454815107771406, "step": 460, "train/sim_loss": 0.09765625 }, { "epoch": 0.0454815107771406, "step": 460, "train/total_loss": 0.20969219505786896 }, { "entropy": 9.677812576293945, "epoch": 0.04558038362665612, "mean_token_accuracy": 0.7240051627159119, "num_tokens": 2542036.0, "step": 461, "train/ce_loss": 1.2770155668258667 }, { "epoch": 0.04558038362665612, "step": 461, "train/sim_loss": 0.1640625 }, { "epoch": 0.04558038362665612, "step": 461, "train/total_loss": 0.2917640805244446 }, { "entropy": 9.004371643066406, "epoch": 0.045679256476171645, "mean_token_accuracy": 0.7064393758773804, "num_tokens": 2547810.0, "step": 462, "train/ce_loss": 0.9959540963172913 }, { "epoch": 0.045679256476171645, "step": 462, "train/sim_loss": 0.125 }, { "epoch": 0.045679256476171645, "step": 462, "train/total_loss": 0.22459541261196136 }, { "entropy": 9.274721145629883, "epoch": 0.045778129325687165, "mean_token_accuracy": 0.7813211679458618, "num_tokens": 2553330.0, "step": 463, "train/ce_loss": 1.101777195930481 }, { "epoch": 0.045778129325687165, "step": 463, "train/sim_loss": 0.12890625 }, { "epoch": 0.045778129325687165, "step": 463, "train/total_loss": 0.23908397555351257 }, { "entropy": 9.725322723388672, "epoch": 0.04587700217520269, "mean_token_accuracy": 0.7410179376602173, "num_tokens": 2558651.0, "step": 464, "train/ce_loss": 0.7252472043037415 }, { "epoch": 0.04587700217520269, "step": 464, "train/sim_loss": 0.12109375 }, { "epoch": 0.04587700217520269, "step": 464, "train/total_loss": 0.19361847639083862 }, { "entropy": 9.617179870605469, "epoch": 0.04597587502471821, "mean_token_accuracy": 0.7373868227005005, "num_tokens": 2563947.0, "step": 465, "train/ce_loss": 0.7710058689117432 }, { "epoch": 0.04597587502471821, "step": 465, "train/sim_loss": 0.0625 }, { "epoch": 0.04597587502471821, "step": 465, "train/total_loss": 0.13960058987140656 }, { "entropy": 9.625958442687988, "epoch": 0.04607474787423373, "mean_token_accuracy": 0.7770859003067017, "num_tokens": 2569360.0, "step": 466, "train/ce_loss": 0.8123289942741394 }, { "epoch": 0.04607474787423373, "step": 466, "train/sim_loss": 0.1328125 }, { "epoch": 0.04607474787423373, "step": 466, "train/total_loss": 0.21404540538787842 }, { "entropy": 9.51323127746582, "epoch": 0.04617362072374926, "mean_token_accuracy": 0.7824561595916748, "num_tokens": 2574790.0, "step": 467, "train/ce_loss": 1.2258681058883667 }, { "epoch": 0.04617362072374926, "step": 467, "train/sim_loss": 0.1875 }, { "epoch": 0.04617362072374926, "step": 467, "train/total_loss": 0.31008681654930115 }, { "entropy": 9.408411026000977, "epoch": 0.04627249357326478, "mean_token_accuracy": 0.7764182686805725, "num_tokens": 2580266.0, "step": 468, "train/ce_loss": 0.9417311549186707 }, { "epoch": 0.04627249357326478, "step": 468, "train/sim_loss": 0.109375 }, { "epoch": 0.04627249357326478, "step": 468, "train/total_loss": 0.2035481184720993 }, { "entropy": 9.265334129333496, "epoch": 0.04637136642278031, "mean_token_accuracy": 0.7210584282875061, "num_tokens": 2585836.0, "step": 469, "train/ce_loss": 1.0412238836288452 }, { "epoch": 0.04637136642278031, "step": 469, "train/sim_loss": 0.17578125 }, { "epoch": 0.04637136642278031, "step": 469, "train/total_loss": 0.2799036502838135 }, { "entropy": 9.33547306060791, "epoch": 0.04647023927229583, "mean_token_accuracy": 0.7086704969406128, "num_tokens": 2591322.0, "step": 470, "train/ce_loss": 1.0897380113601685 }, { "epoch": 0.04647023927229583, "step": 470, "train/sim_loss": 0.19140625 }, { "epoch": 0.04647023927229583, "step": 470, "train/total_loss": 0.30038005113601685 }, { "entropy": 9.323257446289062, "epoch": 0.04656911212181135, "mean_token_accuracy": 0.7400497794151306, "num_tokens": 2596708.0, "step": 471, "train/ce_loss": 0.7879549860954285 }, { "epoch": 0.04656911212181135, "step": 471, "train/sim_loss": 0.1015625 }, { "epoch": 0.04656911212181135, "step": 471, "train/total_loss": 0.18035799264907837 }, { "entropy": 9.69476318359375, "epoch": 0.046667984971326876, "mean_token_accuracy": 0.7456724643707275, "num_tokens": 2602230.0, "step": 472, "train/ce_loss": 1.060380220413208 }, { "epoch": 0.046667984971326876, "step": 472, "train/sim_loss": 0.11328125 }, { "epoch": 0.046667984971326876, "step": 472, "train/total_loss": 0.21931928396224976 }, { "entropy": 9.190179824829102, "epoch": 0.046766857820842396, "mean_token_accuracy": 0.7220447063446045, "num_tokens": 2607810.0, "step": 473, "train/ce_loss": 1.4433903694152832 }, { "epoch": 0.046766857820842396, "step": 473, "train/sim_loss": 0.14453125 }, { "epoch": 0.046766857820842396, "step": 473, "train/total_loss": 0.28887027502059937 }, { "entropy": 9.128694534301758, "epoch": 0.046865730670357916, "mean_token_accuracy": 0.6867891550064087, "num_tokens": 2613527.0, "step": 474, "train/ce_loss": 0.7497481107711792 }, { "epoch": 0.046865730670357916, "step": 474, "train/sim_loss": 0.09765625 }, { "epoch": 0.046865730670357916, "step": 474, "train/total_loss": 0.17263105511665344 }, { "entropy": 9.553457260131836, "epoch": 0.046964603519873444, "mean_token_accuracy": 0.7061855792999268, "num_tokens": 2618854.0, "step": 475, "train/ce_loss": 1.313547968864441 }, { "epoch": 0.046964603519873444, "step": 475, "train/sim_loss": 0.1015625 }, { "epoch": 0.046964603519873444, "step": 475, "train/total_loss": 0.23291729390621185 }, { "entropy": 9.370430946350098, "epoch": 0.047063476369388964, "mean_token_accuracy": 0.781283974647522, "num_tokens": 2624392.0, "step": 476, "train/ce_loss": 0.7777453064918518 }, { "epoch": 0.047063476369388964, "step": 476, "train/sim_loss": 0.125 }, { "epoch": 0.047063476369388964, "step": 476, "train/total_loss": 0.2027745246887207 }, { "entropy": 9.359167098999023, "epoch": 0.04716234921890449, "mean_token_accuracy": 0.7134955525398254, "num_tokens": 2629861.0, "step": 477, "train/ce_loss": 0.8248905539512634 }, { "epoch": 0.04716234921890449, "step": 477, "train/sim_loss": 0.1328125 }, { "epoch": 0.04716234921890449, "step": 477, "train/total_loss": 0.21530155837535858 }, { "entropy": 9.476511001586914, "epoch": 0.04726122206842001, "mean_token_accuracy": 0.7432273030281067, "num_tokens": 2635304.0, "step": 478, "train/ce_loss": 0.7413972020149231 }, { "epoch": 0.04726122206842001, "step": 478, "train/sim_loss": 0.0546875 }, { "epoch": 0.04726122206842001, "step": 478, "train/total_loss": 0.12882721424102783 }, { "entropy": 9.229567527770996, "epoch": 0.04736009491793553, "mean_token_accuracy": 0.7433366179466248, "num_tokens": 2640907.0, "step": 479, "train/ce_loss": 0.7778656482696533 }, { "epoch": 0.04736009491793553, "step": 479, "train/sim_loss": 0.12109375 }, { "epoch": 0.04736009491793553, "step": 479, "train/total_loss": 0.19888031482696533 }, { "epoch": 0.04745896776745106, "grad_norm": 1.2745909690856934, "learning_rate": 9.8840429214261e-06, "loss": 0.2022, "step": 480 }, { "entropy": 9.359058380126953, "epoch": 0.04745896776745106, "mean_token_accuracy": 0.7475994229316711, "num_tokens": 2646203.0, "step": 480, "train/ce_loss": 0.6546791791915894 }, { "epoch": 0.04745896776745106, "step": 480, "train/sim_loss": 0.046875 }, { "epoch": 0.04745896776745106, "step": 480, "train/total_loss": 0.11234291642904282 }, { "entropy": 9.308222770690918, "epoch": 0.04755784061696658, "mean_token_accuracy": 0.7924731373786926, "num_tokens": 2651742.0, "step": 481, "train/ce_loss": 0.7929866909980774 }, { "epoch": 0.04755784061696658, "step": 481, "train/sim_loss": 0.05078125 }, { "epoch": 0.04755784061696658, "step": 481, "train/total_loss": 0.13007992506027222 }, { "entropy": 9.799203872680664, "epoch": 0.047656713466482106, "mean_token_accuracy": 0.743145763874054, "num_tokens": 2656977.0, "step": 482, "train/ce_loss": 0.8029451966285706 }, { "epoch": 0.047656713466482106, "step": 482, "train/sim_loss": 0.078125 }, { "epoch": 0.047656713466482106, "step": 482, "train/total_loss": 0.15841951966285706 }, { "entropy": 9.073314666748047, "epoch": 0.04775558631599763, "mean_token_accuracy": 0.7052730917930603, "num_tokens": 2662568.0, "step": 483, "train/ce_loss": 0.704506516456604 }, { "epoch": 0.04775558631599763, "step": 483, "train/sim_loss": 0.05078125 }, { "epoch": 0.04775558631599763, "step": 483, "train/total_loss": 0.12123190611600876 }, { "entropy": 9.243854522705078, "epoch": 0.04785445916551315, "mean_token_accuracy": 0.7154929637908936, "num_tokens": 2668178.0, "step": 484, "train/ce_loss": 0.6186106204986572 }, { "epoch": 0.04785445916551315, "step": 484, "train/sim_loss": 0.09375 }, { "epoch": 0.04785445916551315, "step": 484, "train/total_loss": 0.1556110680103302 }, { "entropy": 9.14967155456543, "epoch": 0.047953332015028674, "mean_token_accuracy": 0.6891495585441589, "num_tokens": 2673820.0, "step": 485, "train/ce_loss": 0.7060211300849915 }, { "epoch": 0.047953332015028674, "step": 485, "train/sim_loss": 0.16796875 }, { "epoch": 0.047953332015028674, "step": 485, "train/total_loss": 0.23857086896896362 }, { "entropy": 9.401921272277832, "epoch": 0.048052204864544194, "mean_token_accuracy": 0.7676300406455994, "num_tokens": 2679336.0, "step": 486, "train/ce_loss": 0.9934128522872925 }, { "epoch": 0.048052204864544194, "step": 486, "train/sim_loss": 0.12109375 }, { "epoch": 0.048052204864544194, "step": 486, "train/total_loss": 0.2204350382089615 }, { "entropy": 9.516729354858398, "epoch": 0.04815107771405972, "mean_token_accuracy": 0.7090908885002136, "num_tokens": 2684769.0, "step": 487, "train/ce_loss": 0.5195057988166809 }, { "epoch": 0.04815107771405972, "step": 487, "train/sim_loss": 0.109375 }, { "epoch": 0.04815107771405972, "step": 487, "train/total_loss": 0.1613255739212036 }, { "entropy": 8.985054016113281, "epoch": 0.04824995056357524, "mean_token_accuracy": 0.686274528503418, "num_tokens": 2690490.0, "step": 488, "train/ce_loss": 1.0323925018310547 }, { "epoch": 0.04824995056357524, "step": 488, "train/sim_loss": 0.12109375 }, { "epoch": 0.04824995056357524, "step": 488, "train/total_loss": 0.2243330031633377 }, { "entropy": 8.947836875915527, "epoch": 0.04834882341309076, "mean_token_accuracy": 0.7350813746452332, "num_tokens": 2696142.0, "step": 489, "train/ce_loss": 1.4516236782073975 }, { "epoch": 0.04834882341309076, "step": 489, "train/sim_loss": 0.12890625 }, { "epoch": 0.04834882341309076, "step": 489, "train/total_loss": 0.2740686237812042 }, { "entropy": 9.164863586425781, "epoch": 0.04844769626260629, "mean_token_accuracy": 0.701381504535675, "num_tokens": 2701793.0, "step": 490, "train/ce_loss": 1.0825601816177368 }, { "epoch": 0.04844769626260629, "step": 490, "train/sim_loss": 0.1640625 }, { "epoch": 0.04844769626260629, "step": 490, "train/total_loss": 0.2723185122013092 }, { "entropy": 9.500570297241211, "epoch": 0.04854656911212181, "mean_token_accuracy": 0.7651869058609009, "num_tokens": 2707249.0, "step": 491, "train/ce_loss": 0.9539029002189636 }, { "epoch": 0.04854656911212181, "step": 491, "train/sim_loss": 0.08203125 }, { "epoch": 0.04854656911212181, "step": 491, "train/total_loss": 0.17742154002189636 }, { "entropy": 9.696844100952148, "epoch": 0.04864544196163734, "mean_token_accuracy": 0.6812030076980591, "num_tokens": 2712562.0, "step": 492, "train/ce_loss": 1.0125566720962524 }, { "epoch": 0.04864544196163734, "step": 492, "train/sim_loss": 0.10546875 }, { "epoch": 0.04864544196163734, "step": 492, "train/total_loss": 0.20672442018985748 }, { "entropy": 9.587976455688477, "epoch": 0.04874431481115286, "mean_token_accuracy": 0.7285902500152588, "num_tokens": 2717994.0, "step": 493, "train/ce_loss": 0.8766855001449585 }, { "epoch": 0.04874431481115286, "step": 493, "train/sim_loss": 0.1171875 }, { "epoch": 0.04874431481115286, "step": 493, "train/total_loss": 0.2048560529947281 }, { "entropy": 9.071504592895508, "epoch": 0.04884318766066838, "mean_token_accuracy": 0.7281553149223328, "num_tokens": 2723545.0, "step": 494, "train/ce_loss": 0.6545592546463013 }, { "epoch": 0.04884318766066838, "step": 494, "train/sim_loss": 0.05859375 }, { "epoch": 0.04884318766066838, "step": 494, "train/total_loss": 0.12404967844486237 }, { "entropy": 9.424535751342773, "epoch": 0.048942060510183905, "mean_token_accuracy": 0.6952381134033203, "num_tokens": 2729018.0, "step": 495, "train/ce_loss": 1.1928917169570923 }, { "epoch": 0.048942060510183905, "step": 495, "train/sim_loss": 0.12109375 }, { "epoch": 0.048942060510183905, "step": 495, "train/total_loss": 0.24038292467594147 }, { "entropy": 9.606765747070312, "epoch": 0.049040933359699425, "mean_token_accuracy": 0.7634854912757874, "num_tokens": 2734426.0, "step": 496, "train/ce_loss": 0.7997003197669983 }, { "epoch": 0.049040933359699425, "step": 496, "train/sim_loss": 0.05859375 }, { "epoch": 0.049040933359699425, "step": 496, "train/total_loss": 0.13856378197669983 }, { "entropy": 9.061918258666992, "epoch": 0.04913980620921495, "mean_token_accuracy": 0.6951438784599304, "num_tokens": 2740068.0, "step": 497, "train/ce_loss": 1.9616315364837646 }, { "epoch": 0.04913980620921495, "step": 497, "train/sim_loss": 0.16796875 }, { "epoch": 0.04913980620921495, "step": 497, "train/total_loss": 0.3641319274902344 }, { "entropy": 9.468648910522461, "epoch": 0.04923867905873047, "mean_token_accuracy": 0.6540880799293518, "num_tokens": 2745522.0, "step": 498, "train/ce_loss": 0.8665443658828735 }, { "epoch": 0.04923867905873047, "step": 498, "train/sim_loss": 0.09375 }, { "epoch": 0.04923867905873047, "step": 498, "train/total_loss": 0.1804044395685196 }, { "entropy": 8.71371078491211, "epoch": 0.04933755190824599, "mean_token_accuracy": 0.6961326003074646, "num_tokens": 2751412.0, "step": 499, "train/ce_loss": 0.586179256439209 }, { "epoch": 0.04933755190824599, "step": 499, "train/sim_loss": 0.0625 }, { "epoch": 0.04933755190824599, "step": 499, "train/total_loss": 0.12111792713403702 }, { "epoch": 0.04943642475776152, "grad_norm": 1.318381428718567, "learning_rate": 9.87909805666815e-06, "loss": 0.2062, "step": 500 }, { "entropy": 9.48853588104248, "epoch": 0.04943642475776152, "mean_token_accuracy": 0.6920332908630371, "num_tokens": 2756830.0, "step": 500, "train/ce_loss": 0.9447694420814514 }, { "epoch": 0.04943642475776152, "step": 500, "train/sim_loss": 0.0859375 }, { "epoch": 0.04943642475776152, "step": 500, "train/total_loss": 0.18041443824768066 }, { "entropy": 9.26751708984375, "epoch": 0.04953529760727704, "mean_token_accuracy": 0.7921225428581238, "num_tokens": 2762446.0, "step": 501, "train/ce_loss": 0.49797874689102173 }, { "epoch": 0.04953529760727704, "step": 501, "train/sim_loss": 0.109375 }, { "epoch": 0.04953529760727704, "step": 501, "train/total_loss": 0.1591728776693344 }, { "entropy": 9.20521354675293, "epoch": 0.04963417045679257, "mean_token_accuracy": 0.7141424417495728, "num_tokens": 2768028.0, "step": 502, "train/ce_loss": 1.9449820518493652 }, { "epoch": 0.04963417045679257, "step": 502, "train/sim_loss": 0.30078125 }, { "epoch": 0.04963417045679257, "step": 502, "train/total_loss": 0.495279461145401 }, { "entropy": 9.47292709350586, "epoch": 0.04973304330630809, "mean_token_accuracy": 0.75844806432724, "num_tokens": 2773478.0, "step": 503, "train/ce_loss": 0.5587320327758789 }, { "epoch": 0.04973304330630809, "step": 503, "train/sim_loss": 0.05078125 }, { "epoch": 0.04973304330630809, "step": 503, "train/total_loss": 0.10665445029735565 }, { "entropy": 9.202969551086426, "epoch": 0.04983191615582361, "mean_token_accuracy": 0.7473347783088684, "num_tokens": 2779067.0, "step": 504, "train/ce_loss": 0.8113332986831665 }, { "epoch": 0.04983191615582361, "step": 504, "train/sim_loss": 0.09765625 }, { "epoch": 0.04983191615582361, "step": 504, "train/total_loss": 0.17878958582878113 }, { "entropy": 9.261568069458008, "epoch": 0.049930789005339135, "mean_token_accuracy": 0.7304170727729797, "num_tokens": 2784629.0, "step": 505, "train/ce_loss": 0.7083986401557922 }, { "epoch": 0.049930789005339135, "step": 505, "train/sim_loss": 0.10546875 }, { "epoch": 0.049930789005339135, "step": 505, "train/total_loss": 0.17630861699581146 }, { "entropy": 9.383980751037598, "epoch": 0.050029661854854655, "mean_token_accuracy": 0.7397727370262146, "num_tokens": 2790116.0, "step": 506, "train/ce_loss": 0.81044602394104 }, { "epoch": 0.050029661854854655, "step": 506, "train/sim_loss": 0.0546875 }, { "epoch": 0.050029661854854655, "step": 506, "train/total_loss": 0.13573211431503296 }, { "entropy": 8.816777229309082, "epoch": 0.05012853470437018, "mean_token_accuracy": 0.7472089529037476, "num_tokens": 2795889.0, "step": 507, "train/ce_loss": 0.7165438532829285 }, { "epoch": 0.05012853470437018, "step": 507, "train/sim_loss": 0.1640625 }, { "epoch": 0.05012853470437018, "step": 507, "train/total_loss": 0.23571687936782837 }, { "entropy": 9.204126358032227, "epoch": 0.0502274075538857, "mean_token_accuracy": 0.734229564666748, "num_tokens": 2801351.0, "step": 508, "train/ce_loss": 0.5308052897453308 }, { "epoch": 0.0502274075538857, "step": 508, "train/sim_loss": 0.05859375 }, { "epoch": 0.0502274075538857, "step": 508, "train/total_loss": 0.11167427897453308 }, { "entropy": 9.166059494018555, "epoch": 0.05032628040340122, "mean_token_accuracy": 0.7677286863327026, "num_tokens": 2806908.0, "step": 509, "train/ce_loss": 0.6715565919876099 }, { "epoch": 0.05032628040340122, "step": 509, "train/sim_loss": 0.07421875 }, { "epoch": 0.05032628040340122, "step": 509, "train/total_loss": 0.141374409198761 }, { "entropy": 9.711586952209473, "epoch": 0.05042515325291675, "mean_token_accuracy": 0.7167139053344727, "num_tokens": 2812261.0, "step": 510, "train/ce_loss": 0.36069923639297485 }, { "epoch": 0.05042515325291675, "step": 510, "train/sim_loss": 0.1015625 }, { "epoch": 0.05042515325291675, "step": 510, "train/total_loss": 0.13763242959976196 }, { "entropy": 9.126091003417969, "epoch": 0.05052402610243227, "mean_token_accuracy": 0.7001023292541504, "num_tokens": 2817902.0, "step": 511, "train/ce_loss": 1.2376735210418701 }, { "epoch": 0.05052402610243227, "step": 511, "train/sim_loss": 0.18359375 }, { "epoch": 0.05052402610243227, "step": 511, "train/total_loss": 0.30736109614372253 }, { "entropy": 9.467564582824707, "epoch": 0.0506228989519478, "mean_token_accuracy": 0.7039238810539246, "num_tokens": 2823377.0, "step": 512, "train/ce_loss": 1.0557565689086914 }, { "epoch": 0.0506228989519478, "step": 512, "train/sim_loss": 0.08203125 }, { "epoch": 0.0506228989519478, "step": 512, "train/total_loss": 0.18760690093040466 }, { "entropy": 9.431827545166016, "epoch": 0.05072177180146332, "mean_token_accuracy": 0.7112069129943848, "num_tokens": 2828702.0, "step": 513, "train/ce_loss": 0.8137829899787903 }, { "epoch": 0.05072177180146332, "step": 513, "train/sim_loss": 0.109375 }, { "epoch": 0.05072177180146332, "step": 513, "train/total_loss": 0.19075331091880798 }, { "entropy": 9.06843090057373, "epoch": 0.05082064465097884, "mean_token_accuracy": 0.7438905239105225, "num_tokens": 2834299.0, "step": 514, "train/ce_loss": 0.7045574188232422 }, { "epoch": 0.05082064465097884, "step": 514, "train/sim_loss": 0.0546875 }, { "epoch": 0.05082064465097884, "step": 514, "train/total_loss": 0.12514324486255646 }, { "entropy": 9.382444381713867, "epoch": 0.050919517500494366, "mean_token_accuracy": 0.7312430143356323, "num_tokens": 2839832.0, "step": 515, "train/ce_loss": 0.5947422981262207 }, { "epoch": 0.050919517500494366, "step": 515, "train/sim_loss": 0.1484375 }, { "epoch": 0.050919517500494366, "step": 515, "train/total_loss": 0.20791172981262207 }, { "entropy": 9.267562866210938, "epoch": 0.051018390350009886, "mean_token_accuracy": 0.7443991899490356, "num_tokens": 2845455.0, "step": 516, "train/ce_loss": 1.103865146636963 }, { "epoch": 0.051018390350009886, "step": 516, "train/sim_loss": 0.10546875 }, { "epoch": 0.051018390350009886, "step": 516, "train/total_loss": 0.21585527062416077 }, { "entropy": 8.959218978881836, "epoch": 0.05111726319952541, "mean_token_accuracy": 0.7666948437690735, "num_tokens": 2851383.0, "step": 517, "train/ce_loss": 1.309093952178955 }, { "epoch": 0.05111726319952541, "step": 517, "train/sim_loss": 0.10546875 }, { "epoch": 0.05111726319952541, "step": 517, "train/total_loss": 0.23637814819812775 }, { "entropy": 9.699556350708008, "epoch": 0.05121613604904093, "mean_token_accuracy": 0.7335127592086792, "num_tokens": 2856706.0, "step": 518, "train/ce_loss": 0.9762914180755615 }, { "epoch": 0.05121613604904093, "step": 518, "train/sim_loss": 0.109375 }, { "epoch": 0.05121613604904093, "step": 518, "train/total_loss": 0.2070041447877884 }, { "entropy": 9.56744384765625, "epoch": 0.051315008898556454, "mean_token_accuracy": 0.7709359526634216, "num_tokens": 2862074.0, "step": 519, "train/ce_loss": 0.8770809173583984 }, { "epoch": 0.051315008898556454, "step": 519, "train/sim_loss": 0.125 }, { "epoch": 0.051315008898556454, "step": 519, "train/total_loss": 0.21270808577537537 }, { "epoch": 0.05141388174807198, "grad_norm": 1.0875126123428345, "learning_rate": 9.874153191910203e-06, "loss": 0.1939, "step": 520 }, { "entropy": 9.37334156036377, "epoch": 0.05141388174807198, "mean_token_accuracy": 0.721615731716156, "num_tokens": 2867528.0, "step": 520, "train/ce_loss": 0.8396583199501038 }, { "epoch": 0.05141388174807198, "step": 520, "train/sim_loss": 0.046875 }, { "epoch": 0.05141388174807198, "step": 520, "train/total_loss": 0.13084083795547485 }, { "entropy": 9.793841361999512, "epoch": 0.0515127545975875, "mean_token_accuracy": 0.7525473237037659, "num_tokens": 2872864.0, "step": 521, "train/ce_loss": 0.912937581539154 }, { "epoch": 0.0515127545975875, "step": 521, "train/sim_loss": 0.16796875 }, { "epoch": 0.0515127545975875, "step": 521, "train/total_loss": 0.2592625021934509 }, { "entropy": 9.469568252563477, "epoch": 0.05161162744710303, "mean_token_accuracy": 0.7578814625740051, "num_tokens": 2878214.0, "step": 522, "train/ce_loss": 0.8553126454353333 }, { "epoch": 0.05161162744710303, "step": 522, "train/sim_loss": 0.08984375 }, { "epoch": 0.05161162744710303, "step": 522, "train/total_loss": 0.17537501454353333 }, { "entropy": 8.857406616210938, "epoch": 0.05171050029661855, "mean_token_accuracy": 0.6931818127632141, "num_tokens": 2884135.0, "step": 523, "train/ce_loss": 0.6890257000923157 }, { "epoch": 0.05171050029661855, "step": 523, "train/sim_loss": 0.109375 }, { "epoch": 0.05171050029661855, "step": 523, "train/total_loss": 0.17827758193016052 }, { "entropy": 9.328811645507812, "epoch": 0.05180937314613407, "mean_token_accuracy": 0.7873303294181824, "num_tokens": 2889604.0, "step": 524, "train/ce_loss": 0.40848708152770996 }, { "epoch": 0.05180937314613407, "step": 524, "train/sim_loss": 0.05078125 }, { "epoch": 0.05180937314613407, "step": 524, "train/total_loss": 0.09162995964288712 }, { "entropy": 9.330764770507812, "epoch": 0.051908245995649596, "mean_token_accuracy": 0.6985539197921753, "num_tokens": 2895069.0, "step": 525, "train/ce_loss": 0.8234325051307678 }, { "epoch": 0.051908245995649596, "step": 525, "train/sim_loss": 0.0703125 }, { "epoch": 0.051908245995649596, "step": 525, "train/total_loss": 0.15265575051307678 }, { "entropy": 9.51142406463623, "epoch": 0.052007118845165116, "mean_token_accuracy": 0.7389221787452698, "num_tokens": 2900729.0, "step": 526, "train/ce_loss": 0.7845601439476013 }, { "epoch": 0.052007118845165116, "step": 526, "train/sim_loss": 0.078125 }, { "epoch": 0.052007118845165116, "step": 526, "train/total_loss": 0.15658101439476013 }, { "entropy": 9.544654846191406, "epoch": 0.052105991694680644, "mean_token_accuracy": 0.743658185005188, "num_tokens": 2906108.0, "step": 527, "train/ce_loss": 0.6810246706008911 }, { "epoch": 0.052105991694680644, "step": 527, "train/sim_loss": 0.09765625 }, { "epoch": 0.052105991694680644, "step": 527, "train/total_loss": 0.16575872898101807 }, { "entropy": 9.355021476745605, "epoch": 0.052204864544196164, "mean_token_accuracy": 0.7525083422660828, "num_tokens": 2911600.0, "step": 528, "train/ce_loss": 0.4927864074707031 }, { "epoch": 0.052204864544196164, "step": 528, "train/sim_loss": 0.1171875 }, { "epoch": 0.052204864544196164, "step": 528, "train/total_loss": 0.1664661467075348 }, { "entropy": 9.588223457336426, "epoch": 0.052303737393711684, "mean_token_accuracy": 0.73046875, "num_tokens": 2916987.0, "step": 529, "train/ce_loss": 0.6923254132270813 }, { "epoch": 0.052303737393711684, "step": 529, "train/sim_loss": 0.171875 }, { "epoch": 0.052303737393711684, "step": 529, "train/total_loss": 0.24110755324363708 }, { "entropy": 8.990463256835938, "epoch": 0.05240261024322721, "mean_token_accuracy": 0.764598548412323, "num_tokens": 2922755.0, "step": 530, "train/ce_loss": 0.5622440576553345 }, { "epoch": 0.05240261024322721, "step": 530, "train/sim_loss": 0.04296875 }, { "epoch": 0.05240261024322721, "step": 530, "train/total_loss": 0.09919315576553345 }, { "entropy": 9.343250274658203, "epoch": 0.05250148309274273, "mean_token_accuracy": 0.7751141786575317, "num_tokens": 2928229.0, "step": 531, "train/ce_loss": 1.0644227266311646 }, { "epoch": 0.05250148309274273, "step": 531, "train/sim_loss": 0.08984375 }, { "epoch": 0.05250148309274273, "step": 531, "train/total_loss": 0.19628602266311646 }, { "entropy": 9.362048149108887, "epoch": 0.05260035594225826, "mean_token_accuracy": 0.7697756886482239, "num_tokens": 2933663.0, "step": 532, "train/ce_loss": 0.8950389623641968 }, { "epoch": 0.05260035594225826, "step": 532, "train/sim_loss": 0.109375 }, { "epoch": 0.05260035594225826, "step": 532, "train/total_loss": 0.19887889921665192 }, { "entropy": 9.281539916992188, "epoch": 0.05269922879177378, "mean_token_accuracy": 0.8042327761650085, "num_tokens": 2939227.0, "step": 533, "train/ce_loss": 0.44673600792884827 }, { "epoch": 0.05269922879177378, "step": 533, "train/sim_loss": 0.125 }, { "epoch": 0.05269922879177378, "step": 533, "train/total_loss": 0.1696736067533493 }, { "entropy": 9.555377960205078, "epoch": 0.0527981016412893, "mean_token_accuracy": 0.7081218361854553, "num_tokens": 2944671.0, "step": 534, "train/ce_loss": 0.7984234690666199 }, { "epoch": 0.0527981016412893, "step": 534, "train/sim_loss": 0.0859375 }, { "epoch": 0.0527981016412893, "step": 534, "train/total_loss": 0.16577985882759094 }, { "entropy": 9.36761474609375, "epoch": 0.05289697449080483, "mean_token_accuracy": 0.7269663214683533, "num_tokens": 2950083.0, "step": 535, "train/ce_loss": 0.5350989699363708 }, { "epoch": 0.05289697449080483, "step": 535, "train/sim_loss": 0.04296875 }, { "epoch": 0.05289697449080483, "step": 535, "train/total_loss": 0.0964786484837532 }, { "entropy": 9.205673217773438, "epoch": 0.05299584734032035, "mean_token_accuracy": 0.6875612139701843, "num_tokens": 2955665.0, "step": 536, "train/ce_loss": 0.5063661932945251 }, { "epoch": 0.05299584734032035, "step": 536, "train/sim_loss": 0.09765625 }, { "epoch": 0.05299584734032035, "step": 536, "train/total_loss": 0.14829286932945251 }, { "entropy": 9.454233169555664, "epoch": 0.053094720189835874, "mean_token_accuracy": 0.6971279382705688, "num_tokens": 2961038.0, "step": 537, "train/ce_loss": 0.7999914288520813 }, { "epoch": 0.053094720189835874, "step": 537, "train/sim_loss": 0.1328125 }, { "epoch": 0.053094720189835874, "step": 537, "train/total_loss": 0.2128116488456726 }, { "entropy": 9.536530494689941, "epoch": 0.053193593039351394, "mean_token_accuracy": 0.7711978554725647, "num_tokens": 2966432.0, "step": 538, "train/ce_loss": 0.7656124234199524 }, { "epoch": 0.053193593039351394, "step": 538, "train/sim_loss": 0.1328125 }, { "epoch": 0.053193593039351394, "step": 538, "train/total_loss": 0.20937374234199524 }, { "entropy": 9.204168319702148, "epoch": 0.053292465888866915, "mean_token_accuracy": 0.705567479133606, "num_tokens": 2971986.0, "step": 539, "train/ce_loss": 0.7412825226783752 }, { "epoch": 0.053292465888866915, "step": 539, "train/sim_loss": 0.09765625 }, { "epoch": 0.053292465888866915, "step": 539, "train/total_loss": 0.17178450524806976 }, { "epoch": 0.05339133873838244, "grad_norm": 1.1456598043441772, "learning_rate": 9.869208327152253e-06, "loss": 0.1874, "step": 540 }, { "entropy": 9.180940628051758, "epoch": 0.05339133873838244, "mean_token_accuracy": 0.7071038484573364, "num_tokens": 2977578.0, "step": 540, "train/ce_loss": 1.0489882230758667 }, { "epoch": 0.05339133873838244, "step": 540, "train/sim_loss": 0.06640625 }, { "epoch": 0.05339133873838244, "step": 540, "train/total_loss": 0.1713050752878189 }, { "entropy": 9.232544898986816, "epoch": 0.05349021158789796, "mean_token_accuracy": 0.7557651996612549, "num_tokens": 2983376.0, "step": 541, "train/ce_loss": 0.5503873229026794 }, { "epoch": 0.05349021158789796, "step": 541, "train/sim_loss": 0.1484375 }, { "epoch": 0.05349021158789796, "step": 541, "train/total_loss": 0.20347623527050018 }, { "entropy": 9.344259262084961, "epoch": 0.05358908443741349, "mean_token_accuracy": 0.8083242177963257, "num_tokens": 2988919.0, "step": 542, "train/ce_loss": 0.5949644446372986 }, { "epoch": 0.05358908443741349, "step": 542, "train/sim_loss": 0.0546875 }, { "epoch": 0.05358908443741349, "step": 542, "train/total_loss": 0.1141839474439621 }, { "entropy": 9.014297485351562, "epoch": 0.05368795728692901, "mean_token_accuracy": 0.7775551080703735, "num_tokens": 2994580.0, "step": 543, "train/ce_loss": 0.4743942320346832 }, { "epoch": 0.05368795728692901, "step": 543, "train/sim_loss": 0.0546875 }, { "epoch": 0.05368795728692901, "step": 543, "train/total_loss": 0.10212692618370056 }, { "entropy": 9.493232727050781, "epoch": 0.05378683013644453, "mean_token_accuracy": 0.6486486196517944, "num_tokens": 2999993.0, "step": 544, "train/ce_loss": 1.3152939081192017 }, { "epoch": 0.05378683013644453, "step": 544, "train/sim_loss": 0.09765625 }, { "epoch": 0.05378683013644453, "step": 544, "train/total_loss": 0.22918564081192017 }, { "entropy": 9.63710880279541, "epoch": 0.05388570298596006, "mean_token_accuracy": 0.7587600946426392, "num_tokens": 3005255.0, "step": 545, "train/ce_loss": 0.5576032996177673 }, { "epoch": 0.05388570298596006, "step": 545, "train/sim_loss": 0.265625 }, { "epoch": 0.05388570298596006, "step": 545, "train/total_loss": 0.32138532400131226 }, { "entropy": 9.676552772521973, "epoch": 0.05398457583547558, "mean_token_accuracy": 0.7557142972946167, "num_tokens": 3010651.0, "step": 546, "train/ce_loss": 0.7571379542350769 }, { "epoch": 0.05398457583547558, "step": 546, "train/sim_loss": 0.1171875 }, { "epoch": 0.05398457583547558, "step": 546, "train/total_loss": 0.19290129840373993 }, { "entropy": 9.538562774658203, "epoch": 0.054083448684991105, "mean_token_accuracy": 0.7320987582206726, "num_tokens": 3016064.0, "step": 547, "train/ce_loss": 0.49051302671432495 }, { "epoch": 0.054083448684991105, "step": 547, "train/sim_loss": 0.05859375 }, { "epoch": 0.054083448684991105, "step": 547, "train/total_loss": 0.10764504969120026 }, { "entropy": 9.431295394897461, "epoch": 0.054182321534506625, "mean_token_accuracy": 0.7286902070045471, "num_tokens": 3021662.0, "step": 548, "train/ce_loss": 0.5616965293884277 }, { "epoch": 0.054182321534506625, "step": 548, "train/sim_loss": 0.09375 }, { "epoch": 0.054182321534506625, "step": 548, "train/total_loss": 0.14991965889930725 }, { "entropy": 9.664276123046875, "epoch": 0.054281194384022145, "mean_token_accuracy": 0.7580872178077698, "num_tokens": 3027059.0, "step": 549, "train/ce_loss": 0.9629098773002625 }, { "epoch": 0.054281194384022145, "step": 549, "train/sim_loss": 0.0625 }, { "epoch": 0.054281194384022145, "step": 549, "train/total_loss": 0.15879099071025848 }, { "entropy": 9.312294006347656, "epoch": 0.05438006723353767, "mean_token_accuracy": 0.718358039855957, "num_tokens": 3032549.0, "step": 550, "train/ce_loss": 0.7835822701454163 }, { "epoch": 0.05438006723353767, "step": 550, "train/sim_loss": 0.1015625 }, { "epoch": 0.05438006723353767, "step": 550, "train/total_loss": 0.1799207329750061 }, { "entropy": 9.694437026977539, "epoch": 0.05447894008305319, "mean_token_accuracy": 0.7954220175743103, "num_tokens": 3037854.0, "step": 551, "train/ce_loss": 0.49231329560279846 }, { "epoch": 0.05447894008305319, "step": 551, "train/sim_loss": 0.13671875 }, { "epoch": 0.05447894008305319, "step": 551, "train/total_loss": 0.18595008552074432 }, { "entropy": 9.28744888305664, "epoch": 0.05457781293256872, "mean_token_accuracy": 0.7611386179924011, "num_tokens": 3043331.0, "step": 552, "train/ce_loss": 0.937905490398407 }, { "epoch": 0.05457781293256872, "step": 552, "train/sim_loss": 0.12109375 }, { "epoch": 0.05457781293256872, "step": 552, "train/total_loss": 0.21488431096076965 }, { "entropy": 9.475110054016113, "epoch": 0.05467668578208424, "mean_token_accuracy": 0.711670458316803, "num_tokens": 3048825.0, "step": 553, "train/ce_loss": 1.5740858316421509 }, { "epoch": 0.05467668578208424, "step": 553, "train/sim_loss": 0.078125 }, { "epoch": 0.05467668578208424, "step": 553, "train/total_loss": 0.23553358018398285 }, { "entropy": 9.370890617370605, "epoch": 0.05477555863159976, "mean_token_accuracy": 0.7410404682159424, "num_tokens": 3054349.0, "step": 554, "train/ce_loss": 0.9403176307678223 }, { "epoch": 0.05477555863159976, "step": 554, "train/sim_loss": 0.078125 }, { "epoch": 0.05477555863159976, "step": 554, "train/total_loss": 0.17215676605701447 }, { "entropy": 9.184944152832031, "epoch": 0.05487443148111529, "mean_token_accuracy": 0.7894211411476135, "num_tokens": 3060125.0, "step": 555, "train/ce_loss": 0.8001033067703247 }, { "epoch": 0.05487443148111529, "step": 555, "train/sim_loss": 0.19921875 }, { "epoch": 0.05487443148111529, "step": 555, "train/total_loss": 0.279229074716568 }, { "entropy": 9.585567474365234, "epoch": 0.05497330433063081, "mean_token_accuracy": 0.7981770634651184, "num_tokens": 3065472.0, "step": 556, "train/ce_loss": 0.9346737265586853 }, { "epoch": 0.05497330433063081, "step": 556, "train/sim_loss": 0.11328125 }, { "epoch": 0.05497330433063081, "step": 556, "train/total_loss": 0.20674863457679749 }, { "entropy": 9.166794776916504, "epoch": 0.055072177180146335, "mean_token_accuracy": 0.7651434540748596, "num_tokens": 3071011.0, "step": 557, "train/ce_loss": 0.6640275120735168 }, { "epoch": 0.055072177180146335, "step": 557, "train/sim_loss": 0.078125 }, { "epoch": 0.055072177180146335, "step": 557, "train/total_loss": 0.14452776312828064 }, { "entropy": 9.047800064086914, "epoch": 0.055171050029661856, "mean_token_accuracy": 0.7125645279884338, "num_tokens": 3076712.0, "step": 558, "train/ce_loss": 0.45814821124076843 }, { "epoch": 0.055171050029661856, "step": 558, "train/sim_loss": 0.1171875 }, { "epoch": 0.055171050029661856, "step": 558, "train/total_loss": 0.16300232708454132 }, { "entropy": 9.55476188659668, "epoch": 0.055269922879177376, "mean_token_accuracy": 0.745148777961731, "num_tokens": 3082093.0, "step": 559, "train/ce_loss": 0.8888428807258606 }, { "epoch": 0.055269922879177376, "step": 559, "train/sim_loss": 0.12109375 }, { "epoch": 0.055269922879177376, "step": 559, "train/total_loss": 0.20997804403305054 }, { "epoch": 0.0553687957286929, "grad_norm": 1.2151108980178833, "learning_rate": 9.864263462394305e-06, "loss": 0.1952, "step": 560 }, { "entropy": 9.18867301940918, "epoch": 0.0553687957286929, "mean_token_accuracy": 0.7615818977355957, "num_tokens": 3087613.0, "step": 560, "train/ce_loss": 1.0846107006072998 }, { "epoch": 0.0553687957286929, "step": 560, "train/sim_loss": 0.1171875 }, { "epoch": 0.0553687957286929, "step": 560, "train/total_loss": 0.22564858198165894 }, { "entropy": 9.666751861572266, "epoch": 0.05546766857820842, "mean_token_accuracy": 0.7518355250358582, "num_tokens": 3092878.0, "step": 561, "train/ce_loss": 0.6968344449996948 }, { "epoch": 0.05546766857820842, "step": 561, "train/sim_loss": 0.1171875 }, { "epoch": 0.05546766857820842, "step": 561, "train/total_loss": 0.18687094748020172 }, { "entropy": 9.358325958251953, "epoch": 0.055566541427723944, "mean_token_accuracy": 0.7331910133361816, "num_tokens": 3098436.0, "step": 562, "train/ce_loss": 1.1954345703125 }, { "epoch": 0.055566541427723944, "step": 562, "train/sim_loss": 0.12109375 }, { "epoch": 0.055566541427723944, "step": 562, "train/total_loss": 0.24063721299171448 }, { "entropy": 9.5902099609375, "epoch": 0.05566541427723947, "mean_token_accuracy": 0.7598978281021118, "num_tokens": 3103819.0, "step": 563, "train/ce_loss": 0.8564356565475464 }, { "epoch": 0.05566541427723947, "step": 563, "train/sim_loss": 0.08203125 }, { "epoch": 0.05566541427723947, "step": 563, "train/total_loss": 0.16767480969429016 }, { "entropy": 9.415127754211426, "epoch": 0.05576428712675499, "mean_token_accuracy": 0.7214285731315613, "num_tokens": 3109289.0, "step": 564, "train/ce_loss": 0.9082950949668884 }, { "epoch": 0.05576428712675499, "step": 564, "train/sim_loss": 0.12109375 }, { "epoch": 0.05576428712675499, "step": 564, "train/total_loss": 0.2119232714176178 }, { "entropy": 9.250295639038086, "epoch": 0.05586315997627052, "mean_token_accuracy": 0.6663265228271484, "num_tokens": 3114931.0, "step": 565, "train/ce_loss": 1.1925748586654663 }, { "epoch": 0.05586315997627052, "step": 565, "train/sim_loss": 0.1171875 }, { "epoch": 0.05586315997627052, "step": 565, "train/total_loss": 0.23644497990608215 }, { "entropy": 9.530217170715332, "epoch": 0.05596203282578604, "mean_token_accuracy": 0.7065637111663818, "num_tokens": 3120331.0, "step": 566, "train/ce_loss": 0.8044545650482178 }, { "epoch": 0.05596203282578604, "step": 566, "train/sim_loss": 0.12109375 }, { "epoch": 0.05596203282578604, "step": 566, "train/total_loss": 0.20153921842575073 }, { "entropy": 9.377005577087402, "epoch": 0.05606090567530156, "mean_token_accuracy": 0.7658079862594604, "num_tokens": 3125805.0, "step": 567, "train/ce_loss": 0.6526786684989929 }, { "epoch": 0.05606090567530156, "step": 567, "train/sim_loss": 0.09375 }, { "epoch": 0.05606090567530156, "step": 567, "train/total_loss": 0.15901786088943481 }, { "entropy": 9.413352966308594, "epoch": 0.056159778524817086, "mean_token_accuracy": 0.7278911471366882, "num_tokens": 3131242.0, "step": 568, "train/ce_loss": 0.7003741264343262 }, { "epoch": 0.056159778524817086, "step": 568, "train/sim_loss": 0.0859375 }, { "epoch": 0.056159778524817086, "step": 568, "train/total_loss": 0.15597492456436157 }, { "entropy": 9.132238388061523, "epoch": 0.056258651374332606, "mean_token_accuracy": 0.7888015508651733, "num_tokens": 3136925.0, "step": 569, "train/ce_loss": 0.48717328906059265 }, { "epoch": 0.056258651374332606, "step": 569, "train/sim_loss": 0.078125 }, { "epoch": 0.056258651374332606, "step": 569, "train/total_loss": 0.12684233486652374 }, { "entropy": 9.700285911560059, "epoch": 0.056357524223848134, "mean_token_accuracy": 0.7170329689979553, "num_tokens": 3142282.0, "step": 570, "train/ce_loss": 1.1539503335952759 }, { "epoch": 0.056357524223848134, "step": 570, "train/sim_loss": 0.1171875 }, { "epoch": 0.056357524223848134, "step": 570, "train/total_loss": 0.23258253931999207 }, { "entropy": 9.489937782287598, "epoch": 0.056456397073363654, "mean_token_accuracy": 0.7766624689102173, "num_tokens": 3147675.0, "step": 571, "train/ce_loss": 0.4929262101650238 }, { "epoch": 0.056456397073363654, "step": 571, "train/sim_loss": 0.08984375 }, { "epoch": 0.056456397073363654, "step": 571, "train/total_loss": 0.13913637399673462 }, { "entropy": 9.153828620910645, "epoch": 0.056555269922879174, "mean_token_accuracy": 0.7549296021461487, "num_tokens": 3153367.0, "step": 572, "train/ce_loss": 1.3664947748184204 }, { "epoch": 0.056555269922879174, "step": 572, "train/sim_loss": 0.07421875 }, { "epoch": 0.056555269922879174, "step": 572, "train/total_loss": 0.2108682245016098 }, { "entropy": 8.949146270751953, "epoch": 0.0566541427723947, "mean_token_accuracy": 0.74866783618927, "num_tokens": 3159168.0, "step": 573, "train/ce_loss": 0.594531774520874 }, { "epoch": 0.0566541427723947, "step": 573, "train/sim_loss": 0.078125 }, { "epoch": 0.0566541427723947, "step": 573, "train/total_loss": 0.13757817447185516 }, { "entropy": 9.480389595031738, "epoch": 0.05675301562191022, "mean_token_accuracy": 0.7423887848854065, "num_tokens": 3164545.0, "step": 574, "train/ce_loss": 0.7080515623092651 }, { "epoch": 0.05675301562191022, "step": 574, "train/sim_loss": 0.09375 }, { "epoch": 0.05675301562191022, "step": 574, "train/total_loss": 0.164555162191391 }, { "entropy": 9.32792854309082, "epoch": 0.05685188847142575, "mean_token_accuracy": 0.6697247624397278, "num_tokens": 3170179.0, "step": 575, "train/ce_loss": 0.9527100324630737 }, { "epoch": 0.05685188847142575, "step": 575, "train/sim_loss": 0.13671875 }, { "epoch": 0.05685188847142575, "step": 575, "train/total_loss": 0.2319897562265396 }, { "entropy": 9.574392318725586, "epoch": 0.05695076132094127, "mean_token_accuracy": 0.688691258430481, "num_tokens": 3175550.0, "step": 576, "train/ce_loss": 0.6494451761245728 }, { "epoch": 0.05695076132094127, "step": 576, "train/sim_loss": 0.09765625 }, { "epoch": 0.05695076132094127, "step": 576, "train/total_loss": 0.16260077059268951 }, { "entropy": 9.501728057861328, "epoch": 0.05704963417045679, "mean_token_accuracy": 0.766581654548645, "num_tokens": 3180983.0, "step": 577, "train/ce_loss": 1.0397523641586304 }, { "epoch": 0.05704963417045679, "step": 577, "train/sim_loss": 0.06640625 }, { "epoch": 0.05704963417045679, "step": 577, "train/total_loss": 0.17038148641586304 }, { "entropy": 9.442649841308594, "epoch": 0.05714850701997232, "mean_token_accuracy": 0.7738814949989319, "num_tokens": 3186457.0, "step": 578, "train/ce_loss": 0.5522263050079346 }, { "epoch": 0.05714850701997232, "step": 578, "train/sim_loss": 0.08984375 }, { "epoch": 0.05714850701997232, "step": 578, "train/total_loss": 0.14506638050079346 }, { "entropy": 9.272985458374023, "epoch": 0.05724737986948784, "mean_token_accuracy": 0.7006960511207581, "num_tokens": 3191952.0, "step": 579, "train/ce_loss": 1.0077366828918457 }, { "epoch": 0.05724737986948784, "step": 579, "train/sim_loss": 0.11328125 }, { "epoch": 0.05724737986948784, "step": 579, "train/total_loss": 0.2140549123287201 }, { "epoch": 0.057346252719003364, "grad_norm": 1.713935136795044, "learning_rate": 9.859318597636356e-06, "loss": 0.1902, "step": 580 }, { "entropy": 9.373992919921875, "epoch": 0.057346252719003364, "mean_token_accuracy": 0.7587034702301025, "num_tokens": 3197382.0, "step": 580, "train/ce_loss": 0.661425769329071 }, { "epoch": 0.057346252719003364, "step": 580, "train/sim_loss": 0.07421875 }, { "epoch": 0.057346252719003364, "step": 580, "train/total_loss": 0.14036133885383606 }, { "entropy": 9.120494842529297, "epoch": 0.057445125568518884, "mean_token_accuracy": 0.7974971532821655, "num_tokens": 3202926.0, "step": 581, "train/ce_loss": 0.8061524033546448 }, { "epoch": 0.057445125568518884, "step": 581, "train/sim_loss": 0.09765625 }, { "epoch": 0.057445125568518884, "step": 581, "train/total_loss": 0.17827150225639343 }, { "entropy": 9.135716438293457, "epoch": 0.057543998418034405, "mean_token_accuracy": 0.7356746792793274, "num_tokens": 3208588.0, "step": 582, "train/ce_loss": 0.4718203544616699 }, { "epoch": 0.057543998418034405, "step": 582, "train/sim_loss": 0.1875 }, { "epoch": 0.057543998418034405, "step": 582, "train/total_loss": 0.23468203842639923 }, { "entropy": 9.263733863830566, "epoch": 0.05764287126754993, "mean_token_accuracy": 0.6825094819068909, "num_tokens": 3214197.0, "step": 583, "train/ce_loss": 1.0188088417053223 }, { "epoch": 0.05764287126754993, "step": 583, "train/sim_loss": 0.09375 }, { "epoch": 0.05764287126754993, "step": 583, "train/total_loss": 0.19563087821006775 }, { "entropy": 9.538263320922852, "epoch": 0.05774174411706545, "mean_token_accuracy": 0.7364531755447388, "num_tokens": 3219564.0, "step": 584, "train/ce_loss": 1.0009828805923462 }, { "epoch": 0.05774174411706545, "step": 584, "train/sim_loss": 0.05859375 }, { "epoch": 0.05774174411706545, "step": 584, "train/total_loss": 0.15869203209877014 }, { "entropy": 9.444204330444336, "epoch": 0.05784061696658098, "mean_token_accuracy": 0.7490347623825073, "num_tokens": 3224902.0, "step": 585, "train/ce_loss": 0.8690897226333618 }, { "epoch": 0.05784061696658098, "step": 585, "train/sim_loss": 0.109375 }, { "epoch": 0.05784061696658098, "step": 585, "train/total_loss": 0.1962839663028717 }, { "entropy": 9.061064720153809, "epoch": 0.0579394898160965, "mean_token_accuracy": 0.8422619104385376, "num_tokens": 3230557.0, "step": 586, "train/ce_loss": 0.5789579153060913 }, { "epoch": 0.0579394898160965, "step": 586, "train/sim_loss": 0.0546875 }, { "epoch": 0.0579394898160965, "step": 586, "train/total_loss": 0.11258329451084137 }, { "entropy": 9.4165620803833, "epoch": 0.05803836266561202, "mean_token_accuracy": 0.7041916251182556, "num_tokens": 3236133.0, "step": 587, "train/ce_loss": 0.7449560761451721 }, { "epoch": 0.05803836266561202, "step": 587, "train/sim_loss": 0.14453125 }, { "epoch": 0.05803836266561202, "step": 587, "train/total_loss": 0.2190268635749817 }, { "entropy": 9.219877243041992, "epoch": 0.05813723551512755, "mean_token_accuracy": 0.7443991899490356, "num_tokens": 3241693.0, "step": 588, "train/ce_loss": 0.9700565934181213 }, { "epoch": 0.05813723551512755, "step": 588, "train/sim_loss": 0.08984375 }, { "epoch": 0.05813723551512755, "step": 588, "train/total_loss": 0.1868494153022766 }, { "entropy": 9.135024070739746, "epoch": 0.05823610836464307, "mean_token_accuracy": 0.7552238702774048, "num_tokens": 3247365.0, "step": 589, "train/ce_loss": 0.657670795917511 }, { "epoch": 0.05823610836464307, "step": 589, "train/sim_loss": 0.03125 }, { "epoch": 0.05823610836464307, "step": 589, "train/total_loss": 0.0970170795917511 }, { "entropy": 9.346161842346191, "epoch": 0.058334981214158595, "mean_token_accuracy": 0.7335562705993652, "num_tokens": 3252840.0, "step": 590, "train/ce_loss": 1.0360523462295532 }, { "epoch": 0.058334981214158595, "step": 590, "train/sim_loss": 0.1328125 }, { "epoch": 0.058334981214158595, "step": 590, "train/total_loss": 0.2364177405834198 }, { "entropy": 9.530954360961914, "epoch": 0.058433854063674115, "mean_token_accuracy": 0.7552083134651184, "num_tokens": 3258212.0, "step": 591, "train/ce_loss": 1.3705047369003296 }, { "epoch": 0.058433854063674115, "step": 591, "train/sim_loss": 0.12109375 }, { "epoch": 0.058433854063674115, "step": 591, "train/total_loss": 0.25814422965049744 }, { "entropy": 9.274154663085938, "epoch": 0.058532726913189635, "mean_token_accuracy": 0.7602591514587402, "num_tokens": 3263741.0, "step": 592, "train/ce_loss": 0.9293384552001953 }, { "epoch": 0.058532726913189635, "step": 592, "train/sim_loss": 0.0859375 }, { "epoch": 0.058532726913189635, "step": 592, "train/total_loss": 0.17887134850025177 }, { "entropy": 9.27614974975586, "epoch": 0.05863159976270516, "mean_token_accuracy": 0.7040951251983643, "num_tokens": 3269179.0, "step": 593, "train/ce_loss": 1.1823933124542236 }, { "epoch": 0.05863159976270516, "step": 593, "train/sim_loss": 0.1015625 }, { "epoch": 0.05863159976270516, "step": 593, "train/total_loss": 0.21980184316635132 }, { "entropy": 9.158742904663086, "epoch": 0.05873047261222068, "mean_token_accuracy": 0.708695650100708, "num_tokens": 3275042.0, "step": 594, "train/ce_loss": 1.7689285278320312 }, { "epoch": 0.05873047261222068, "step": 594, "train/sim_loss": 0.1171875 }, { "epoch": 0.05873047261222068, "step": 594, "train/total_loss": 0.29408037662506104 }, { "entropy": 9.426338195800781, "epoch": 0.05882934546173621, "mean_token_accuracy": 0.7709832191467285, "num_tokens": 3280464.0, "step": 595, "train/ce_loss": 0.5437971353530884 }, { "epoch": 0.05882934546173621, "step": 595, "train/sim_loss": 0.09375 }, { "epoch": 0.05882934546173621, "step": 595, "train/total_loss": 0.14812971651554108 }, { "entropy": 9.40764331817627, "epoch": 0.05892821831125173, "mean_token_accuracy": 0.7159353494644165, "num_tokens": 3285971.0, "step": 596, "train/ce_loss": 0.5695697665214539 }, { "epoch": 0.05892821831125173, "step": 596, "train/sim_loss": 0.0859375 }, { "epoch": 0.05892821831125173, "step": 596, "train/total_loss": 0.14289447665214539 }, { "entropy": 9.057065963745117, "epoch": 0.05902709116076725, "mean_token_accuracy": 0.7403846383094788, "num_tokens": 3291589.0, "step": 597, "train/ce_loss": 0.6980687975883484 }, { "epoch": 0.05902709116076725, "step": 597, "train/sim_loss": 0.12109375 }, { "epoch": 0.05902709116076725, "step": 597, "train/total_loss": 0.19090062379837036 }, { "entropy": 9.478327751159668, "epoch": 0.05912596401028278, "mean_token_accuracy": 0.7151370644569397, "num_tokens": 3297011.0, "step": 598, "train/ce_loss": 0.8376517295837402 }, { "epoch": 0.05912596401028278, "step": 598, "train/sim_loss": 0.0703125 }, { "epoch": 0.05912596401028278, "step": 598, "train/total_loss": 0.1540776789188385 }, { "entropy": 9.06879997253418, "epoch": 0.0592248368597983, "mean_token_accuracy": 0.7408477663993835, "num_tokens": 3302796.0, "step": 599, "train/ce_loss": 0.8432909250259399 }, { "epoch": 0.0592248368597983, "step": 599, "train/sim_loss": 0.10546875 }, { "epoch": 0.0592248368597983, "step": 599, "train/total_loss": 0.18979784846305847 }, { "epoch": 0.059323709709313825, "grad_norm": 1.4222511053085327, "learning_rate": 9.854373732878406e-06, "loss": 0.1843, "step": 600 }, { "entropy": 9.72435188293457, "epoch": 0.059323709709313825, "mean_token_accuracy": 0.7372061014175415, "num_tokens": 3308120.0, "step": 600, "train/ce_loss": 0.7391245365142822 }, { "epoch": 0.059323709709313825, "step": 600, "train/sim_loss": 0.08203125 }, { "epoch": 0.059323709709313825, "step": 600, "train/total_loss": 0.15594370663166046 }, { "entropy": 9.372842788696289, "epoch": 0.059422582558829345, "mean_token_accuracy": 0.7096773982048035, "num_tokens": 3313594.0, "step": 601, "train/ce_loss": 0.5537778735160828 }, { "epoch": 0.059422582558829345, "step": 601, "train/sim_loss": 0.125 }, { "epoch": 0.059422582558829345, "step": 601, "train/total_loss": 0.1803777813911438 }, { "entropy": 9.280401229858398, "epoch": 0.059521455408344866, "mean_token_accuracy": 0.759096622467041, "num_tokens": 3318923.0, "step": 602, "train/ce_loss": 0.8712440133094788 }, { "epoch": 0.059521455408344866, "step": 602, "train/sim_loss": 0.05859375 }, { "epoch": 0.059521455408344866, "step": 602, "train/total_loss": 0.14571815729141235 }, { "entropy": 9.273475646972656, "epoch": 0.05962032825786039, "mean_token_accuracy": 0.7923250794410706, "num_tokens": 3324418.0, "step": 603, "train/ce_loss": 0.7934929132461548 }, { "epoch": 0.05962032825786039, "step": 603, "train/sim_loss": 0.1015625 }, { "epoch": 0.05962032825786039, "step": 603, "train/total_loss": 0.18091179430484772 }, { "entropy": 9.104297637939453, "epoch": 0.05971920110737591, "mean_token_accuracy": 0.7597535848617554, "num_tokens": 3329965.0, "step": 604, "train/ce_loss": 0.8910924792289734 }, { "epoch": 0.05971920110737591, "step": 604, "train/sim_loss": 0.08203125 }, { "epoch": 0.05971920110737591, "step": 604, "train/total_loss": 0.17114049196243286 }, { "entropy": 9.389507293701172, "epoch": 0.05981807395689144, "mean_token_accuracy": 0.6824175715446472, "num_tokens": 3335470.0, "step": 605, "train/ce_loss": 0.927566409111023 }, { "epoch": 0.05981807395689144, "step": 605, "train/sim_loss": 0.125 }, { "epoch": 0.05981807395689144, "step": 605, "train/total_loss": 0.21775664389133453 }, { "entropy": 9.263090133666992, "epoch": 0.05991694680640696, "mean_token_accuracy": 0.7176724076271057, "num_tokens": 3341073.0, "step": 606, "train/ce_loss": 1.4942119121551514 }, { "epoch": 0.05991694680640696, "step": 606, "train/sim_loss": 0.1328125 }, { "epoch": 0.05991694680640696, "step": 606, "train/total_loss": 0.28223371505737305 }, { "entropy": 9.775233268737793, "epoch": 0.06001581965592248, "mean_token_accuracy": 0.7845304012298584, "num_tokens": 3346560.0, "step": 607, "train/ce_loss": 0.6119033098220825 }, { "epoch": 0.06001581965592248, "step": 607, "train/sim_loss": 0.078125 }, { "epoch": 0.06001581965592248, "step": 607, "train/total_loss": 0.13931533694267273 }, { "entropy": 9.33908462524414, "epoch": 0.06011469250543801, "mean_token_accuracy": 0.7785714268684387, "num_tokens": 3352088.0, "step": 608, "train/ce_loss": 0.6551539301872253 }, { "epoch": 0.06011469250543801, "step": 608, "train/sim_loss": 0.125 }, { "epoch": 0.06011469250543801, "step": 608, "train/total_loss": 0.190515398979187 }, { "entropy": 9.221921920776367, "epoch": 0.06021356535495353, "mean_token_accuracy": 0.7081384658813477, "num_tokens": 3357709.0, "step": 609, "train/ce_loss": 0.45772966742515564 }, { "epoch": 0.06021356535495353, "step": 609, "train/sim_loss": 0.05859375 }, { "epoch": 0.06021356535495353, "step": 609, "train/total_loss": 0.1043667197227478 }, { "entropy": 9.469886779785156, "epoch": 0.060312438204469056, "mean_token_accuracy": 0.7845237851142883, "num_tokens": 3363177.0, "step": 610, "train/ce_loss": 0.5438740849494934 }, { "epoch": 0.060312438204469056, "step": 610, "train/sim_loss": 0.10546875 }, { "epoch": 0.060312438204469056, "step": 610, "train/total_loss": 0.1598561555147171 }, { "entropy": 9.43984317779541, "epoch": 0.060411311053984576, "mean_token_accuracy": 0.6926829218864441, "num_tokens": 3368646.0, "step": 611, "train/ce_loss": 0.7203174233436584 }, { "epoch": 0.060411311053984576, "step": 611, "train/sim_loss": 0.109375 }, { "epoch": 0.060411311053984576, "step": 611, "train/total_loss": 0.18140673637390137 }, { "entropy": 9.371284484863281, "epoch": 0.060510183903500096, "mean_token_accuracy": 0.7847380638122559, "num_tokens": 3374134.0, "step": 612, "train/ce_loss": 0.3509823977947235 }, { "epoch": 0.060510183903500096, "step": 612, "train/sim_loss": 0.08203125 }, { "epoch": 0.060510183903500096, "step": 612, "train/total_loss": 0.11712948977947235 }, { "entropy": 9.164239883422852, "epoch": 0.060609056753015624, "mean_token_accuracy": 0.7604485154151917, "num_tokens": 3379798.0, "step": 613, "train/ce_loss": 0.4699164628982544 }, { "epoch": 0.060609056753015624, "step": 613, "train/sim_loss": 0.1328125 }, { "epoch": 0.060609056753015624, "step": 613, "train/total_loss": 0.17980414628982544 }, { "entropy": 9.611525535583496, "epoch": 0.060707929602531144, "mean_token_accuracy": 0.7425997257232666, "num_tokens": 3385195.0, "step": 614, "train/ce_loss": 1.0928311347961426 }, { "epoch": 0.060707929602531144, "step": 614, "train/sim_loss": 0.109375 }, { "epoch": 0.060707929602531144, "step": 614, "train/total_loss": 0.21865811944007874 }, { "entropy": 9.679367065429688, "epoch": 0.06080680245204667, "mean_token_accuracy": 0.7376312017440796, "num_tokens": 3390495.0, "step": 615, "train/ce_loss": 1.0647376775741577 }, { "epoch": 0.06080680245204667, "step": 615, "train/sim_loss": 0.10546875 }, { "epoch": 0.06080680245204667, "step": 615, "train/total_loss": 0.21194252371788025 }, { "entropy": 9.618717193603516, "epoch": 0.06090567530156219, "mean_token_accuracy": 0.699999988079071, "num_tokens": 3395758.0, "step": 616, "train/ce_loss": 1.0412601232528687 }, { "epoch": 0.06090567530156219, "step": 616, "train/sim_loss": 0.06640625 }, { "epoch": 0.06090567530156219, "step": 616, "train/total_loss": 0.1705322563648224 }, { "entropy": 9.32923698425293, "epoch": 0.06100454815107771, "mean_token_accuracy": 0.7197049260139465, "num_tokens": 3401354.0, "step": 617, "train/ce_loss": 0.6299319267272949 }, { "epoch": 0.06100454815107771, "step": 617, "train/sim_loss": 0.1171875 }, { "epoch": 0.06100454815107771, "step": 617, "train/total_loss": 0.18018069863319397 }, { "entropy": 9.457327842712402, "epoch": 0.06110342100059324, "mean_token_accuracy": 0.7178871631622314, "num_tokens": 3406785.0, "step": 618, "train/ce_loss": 0.9152182936668396 }, { "epoch": 0.06110342100059324, "step": 618, "train/sim_loss": 0.109375 }, { "epoch": 0.06110342100059324, "step": 618, "train/total_loss": 0.20089682936668396 }, { "entropy": 9.139745712280273, "epoch": 0.06120229385010876, "mean_token_accuracy": 0.7585868239402771, "num_tokens": 3412396.0, "step": 619, "train/ce_loss": 0.4903017580509186 }, { "epoch": 0.06120229385010876, "step": 619, "train/sim_loss": 0.046875 }, { "epoch": 0.06120229385010876, "step": 619, "train/total_loss": 0.09590517729520798 }, { "epoch": 0.061301166699624286, "grad_norm": 1.1261823177337646, "learning_rate": 9.849428868120457e-06, "loss": 0.1866, "step": 620 }, { "entropy": 8.985061645507812, "epoch": 0.061301166699624286, "mean_token_accuracy": 0.7536108493804932, "num_tokens": 3418156.0, "step": 620, "train/ce_loss": 0.5508598685264587 }, { "epoch": 0.061301166699624286, "step": 620, "train/sim_loss": 0.05078125 }, { "epoch": 0.061301166699624286, "step": 620, "train/total_loss": 0.10586723685264587 }, { "entropy": 9.626968383789062, "epoch": 0.06140003954913981, "mean_token_accuracy": 0.7137355804443359, "num_tokens": 3423467.0, "step": 621, "train/ce_loss": 0.9002852439880371 }, { "epoch": 0.06140003954913981, "step": 621, "train/sim_loss": 0.109375 }, { "epoch": 0.06140003954913981, "step": 621, "train/total_loss": 0.1994035243988037 }, { "entropy": 9.582820892333984, "epoch": 0.06149891239865533, "mean_token_accuracy": 0.7092866897583008, "num_tokens": 3428869.0, "step": 622, "train/ce_loss": 1.1821781396865845 }, { "epoch": 0.06149891239865533, "step": 622, "train/sim_loss": 0.12109375 }, { "epoch": 0.06149891239865533, "step": 622, "train/total_loss": 0.2393115758895874 }, { "entropy": 9.171717643737793, "epoch": 0.061597785248170854, "mean_token_accuracy": 0.7873563170433044, "num_tokens": 3434506.0, "step": 623, "train/ce_loss": 0.6770414710044861 }, { "epoch": 0.061597785248170854, "step": 623, "train/sim_loss": 0.1015625 }, { "epoch": 0.061597785248170854, "step": 623, "train/total_loss": 0.16926664113998413 }, { "entropy": 9.378104209899902, "epoch": 0.061696658097686374, "mean_token_accuracy": 0.7382256388664246, "num_tokens": 3439999.0, "step": 624, "train/ce_loss": 0.6349619626998901 }, { "epoch": 0.061696658097686374, "step": 624, "train/sim_loss": 0.0390625 }, { "epoch": 0.061696658097686374, "step": 624, "train/total_loss": 0.1025586947798729 }, { "entropy": 9.522222518920898, "epoch": 0.0617955309472019, "mean_token_accuracy": 0.7840313911437988, "num_tokens": 3445409.0, "step": 625, "train/ce_loss": 0.626710057258606 }, { "epoch": 0.0617955309472019, "step": 625, "train/sim_loss": 0.1171875 }, { "epoch": 0.0617955309472019, "step": 625, "train/total_loss": 0.1798585057258606 }, { "entropy": 9.386995315551758, "epoch": 0.06189440379671742, "mean_token_accuracy": 0.792941153049469, "num_tokens": 3450849.0, "step": 626, "train/ce_loss": 0.5324215888977051 }, { "epoch": 0.06189440379671742, "step": 626, "train/sim_loss": 0.0546875 }, { "epoch": 0.06189440379671742, "step": 626, "train/total_loss": 0.10792966187000275 }, { "entropy": 9.344130516052246, "epoch": 0.06199327664623294, "mean_token_accuracy": 0.7071129679679871, "num_tokens": 3456338.0, "step": 627, "train/ce_loss": 0.5877262949943542 }, { "epoch": 0.06199327664623294, "step": 627, "train/sim_loss": 0.1015625 }, { "epoch": 0.06199327664623294, "step": 627, "train/total_loss": 0.16033512353897095 }, { "entropy": 9.496828079223633, "epoch": 0.06209214949574847, "mean_token_accuracy": 0.8224181532859802, "num_tokens": 3461742.0, "step": 628, "train/ce_loss": 0.550769031047821 }, { "epoch": 0.06209214949574847, "step": 628, "train/sim_loss": 0.078125 }, { "epoch": 0.06209214949574847, "step": 628, "train/total_loss": 0.13320189714431763 }, { "entropy": 9.640973091125488, "epoch": 0.06219102234526399, "mean_token_accuracy": 0.715584397315979, "num_tokens": 3467077.0, "step": 629, "train/ce_loss": 1.5471653938293457 }, { "epoch": 0.06219102234526399, "step": 629, "train/sim_loss": 0.08984375 }, { "epoch": 0.06219102234526399, "step": 629, "train/total_loss": 0.24456028640270233 }, { "entropy": 9.261007308959961, "epoch": 0.06228989519477952, "mean_token_accuracy": 0.7809224128723145, "num_tokens": 3472559.0, "step": 630, "train/ce_loss": 0.4331835210323334 }, { "epoch": 0.06228989519477952, "step": 630, "train/sim_loss": 0.0625 }, { "epoch": 0.06228989519477952, "step": 630, "train/total_loss": 0.10581835359334946 }, { "entropy": 9.471304893493652, "epoch": 0.06238876804429504, "mean_token_accuracy": 0.6617143154144287, "num_tokens": 3478064.0, "step": 631, "train/ce_loss": 1.171181082725525 }, { "epoch": 0.06238876804429504, "step": 631, "train/sim_loss": 0.13671875 }, { "epoch": 0.06238876804429504, "step": 631, "train/total_loss": 0.25383687019348145 }, { "entropy": 9.293900489807129, "epoch": 0.06248764089381056, "mean_token_accuracy": 0.7802547812461853, "num_tokens": 3483539.0, "step": 632, "train/ce_loss": 0.6460439562797546 }, { "epoch": 0.06248764089381056, "step": 632, "train/sim_loss": 0.046875 }, { "epoch": 0.06248764089381056, "step": 632, "train/total_loss": 0.11147939413785934 }, { "entropy": 9.508598327636719, "epoch": 0.06258651374332608, "mean_token_accuracy": 0.7265415787696838, "num_tokens": 3488864.0, "step": 633, "train/ce_loss": 0.7199188470840454 }, { "epoch": 0.06258651374332608, "step": 633, "train/sim_loss": 0.08984375 }, { "epoch": 0.06258651374332608, "step": 633, "train/total_loss": 0.16183564066886902 }, { "entropy": 9.362704277038574, "epoch": 0.0626853865928416, "mean_token_accuracy": 0.7112231850624084, "num_tokens": 3494291.0, "step": 634, "train/ce_loss": 1.038554310798645 }, { "epoch": 0.0626853865928416, "step": 634, "train/sim_loss": 0.140625 }, { "epoch": 0.0626853865928416, "step": 634, "train/total_loss": 0.2444804310798645 }, { "entropy": 9.309609413146973, "epoch": 0.06278425944235713, "mean_token_accuracy": 0.7313609719276428, "num_tokens": 3499793.0, "step": 635, "train/ce_loss": 1.3853336572647095 }, { "epoch": 0.06278425944235713, "step": 635, "train/sim_loss": 0.125 }, { "epoch": 0.06278425944235713, "step": 635, "train/total_loss": 0.263533353805542 }, { "entropy": 8.651618003845215, "epoch": 0.06288313229187265, "mean_token_accuracy": 0.7612156271934509, "num_tokens": 3505808.0, "step": 636, "train/ce_loss": 1.8713984489440918 }, { "epoch": 0.06288313229187265, "step": 636, "train/sim_loss": 0.09375 }, { "epoch": 0.06288313229187265, "step": 636, "train/total_loss": 0.2808898687362671 }, { "entropy": 9.224616050720215, "epoch": 0.06298200514138817, "mean_token_accuracy": 0.7259953022003174, "num_tokens": 3511262.0, "step": 637, "train/ce_loss": 0.6112634539604187 }, { "epoch": 0.06298200514138817, "step": 637, "train/sim_loss": 0.07421875 }, { "epoch": 0.06298200514138817, "step": 637, "train/total_loss": 0.13534510135650635 }, { "entropy": 9.329656600952148, "epoch": 0.0630808779909037, "mean_token_accuracy": 0.7238895297050476, "num_tokens": 3516659.0, "step": 638, "train/ce_loss": 0.7000834941864014 }, { "epoch": 0.0630808779909037, "step": 638, "train/sim_loss": 0.0625 }, { "epoch": 0.0630808779909037, "step": 638, "train/total_loss": 0.13250835239887238 }, { "entropy": 9.307123184204102, "epoch": 0.06317975084041923, "mean_token_accuracy": 0.6900296211242676, "num_tokens": 3522475.0, "step": 639, "train/ce_loss": 1.0382031202316284 }, { "epoch": 0.06317975084041923, "step": 639, "train/sim_loss": 0.09375 }, { "epoch": 0.06317975084041923, "step": 639, "train/total_loss": 0.1975703239440918 }, { "epoch": 0.06327862368993474, "grad_norm": 1.3117868900299072, "learning_rate": 9.844484003362509e-06, "loss": 0.1807, "step": 640 }, { "entropy": 9.34765338897705, "epoch": 0.06327862368993474, "mean_token_accuracy": 0.704935610294342, "num_tokens": 3528005.0, "step": 640, "train/ce_loss": 0.7464196085929871 }, { "epoch": 0.06327862368993474, "step": 640, "train/sim_loss": 0.109375 }, { "epoch": 0.06327862368993474, "step": 640, "train/total_loss": 0.18401697278022766 }, { "entropy": 9.76870346069336, "epoch": 0.06337749653945027, "mean_token_accuracy": 0.7291338443756104, "num_tokens": 3533253.0, "step": 641, "train/ce_loss": 0.8550685048103333 }, { "epoch": 0.06337749653945027, "step": 641, "train/sim_loss": 0.125 }, { "epoch": 0.06337749653945027, "step": 641, "train/total_loss": 0.2105068564414978 }, { "entropy": 9.638051986694336, "epoch": 0.0634763693889658, "mean_token_accuracy": 0.7315521836280823, "num_tokens": 3538621.0, "step": 642, "train/ce_loss": 0.7263368368148804 }, { "epoch": 0.0634763693889658, "step": 642, "train/sim_loss": 0.09375 }, { "epoch": 0.0634763693889658, "step": 642, "train/total_loss": 0.16638368368148804 }, { "entropy": 9.206868171691895, "epoch": 0.06357524223848131, "mean_token_accuracy": 0.7283950448036194, "num_tokens": 3544261.0, "step": 643, "train/ce_loss": 1.1094578504562378 }, { "epoch": 0.06357524223848131, "step": 643, "train/sim_loss": 0.1875 }, { "epoch": 0.06357524223848131, "step": 643, "train/total_loss": 0.29844579100608826 }, { "entropy": 9.791531562805176, "epoch": 0.06367411508799684, "mean_token_accuracy": 0.7485029697418213, "num_tokens": 3549594.0, "step": 644, "train/ce_loss": 0.6026373505592346 }, { "epoch": 0.06367411508799684, "step": 644, "train/sim_loss": 0.109375 }, { "epoch": 0.06367411508799684, "step": 644, "train/total_loss": 0.1696387380361557 }, { "entropy": 9.352677345275879, "epoch": 0.06377298793751236, "mean_token_accuracy": 0.694932758808136, "num_tokens": 3555120.0, "step": 645, "train/ce_loss": 1.2912312746047974 }, { "epoch": 0.06377298793751236, "step": 645, "train/sim_loss": 0.1171875 }, { "epoch": 0.06377298793751236, "step": 645, "train/total_loss": 0.24631063640117645 }, { "entropy": 9.437315940856934, "epoch": 0.06387186078702788, "mean_token_accuracy": 0.7580453157424927, "num_tokens": 3560577.0, "step": 646, "train/ce_loss": 0.4853718876838684 }, { "epoch": 0.06387186078702788, "step": 646, "train/sim_loss": 0.0625 }, { "epoch": 0.06387186078702788, "step": 646, "train/total_loss": 0.11103719472885132 }, { "entropy": 9.185730934143066, "epoch": 0.0639707336365434, "mean_token_accuracy": 0.768278956413269, "num_tokens": 3566112.0, "step": 647, "train/ce_loss": 0.6532512307167053 }, { "epoch": 0.0639707336365434, "step": 647, "train/sim_loss": 0.05078125 }, { "epoch": 0.0639707336365434, "step": 647, "train/total_loss": 0.11610637605190277 }, { "entropy": 9.393474578857422, "epoch": 0.06406960648605893, "mean_token_accuracy": 0.7328605055809021, "num_tokens": 3571591.0, "step": 648, "train/ce_loss": 0.9369293451309204 }, { "epoch": 0.06406960648605893, "step": 648, "train/sim_loss": 0.12109375 }, { "epoch": 0.06406960648605893, "step": 648, "train/total_loss": 0.21478667855262756 }, { "entropy": 9.346091270446777, "epoch": 0.06416847933557446, "mean_token_accuracy": 0.6969365477561951, "num_tokens": 3577171.0, "step": 649, "train/ce_loss": 0.5856436491012573 }, { "epoch": 0.06416847933557446, "step": 649, "train/sim_loss": 0.125 }, { "epoch": 0.06416847933557446, "step": 649, "train/total_loss": 0.18356436491012573 }, { "entropy": 9.219921112060547, "epoch": 0.06426735218508997, "mean_token_accuracy": 0.7808219194412231, "num_tokens": 3582573.0, "step": 650, "train/ce_loss": 0.6007207632064819 }, { "epoch": 0.06426735218508997, "step": 650, "train/sim_loss": 0.09375 }, { "epoch": 0.06426735218508997, "step": 650, "train/total_loss": 0.15382207930088043 }, { "entropy": 9.223957061767578, "epoch": 0.0643662250346055, "mean_token_accuracy": 0.6956989169120789, "num_tokens": 3588152.0, "step": 651, "train/ce_loss": 0.9352828860282898 }, { "epoch": 0.0643662250346055, "step": 651, "train/sim_loss": 0.12109375 }, { "epoch": 0.0643662250346055, "step": 651, "train/total_loss": 0.21462205052375793 }, { "entropy": 9.076240539550781, "epoch": 0.06446509788412103, "mean_token_accuracy": 0.7065813541412354, "num_tokens": 3593796.0, "step": 652, "train/ce_loss": 1.8695961236953735 }, { "epoch": 0.06446509788412103, "step": 652, "train/sim_loss": 0.09375 }, { "epoch": 0.06446509788412103, "step": 652, "train/total_loss": 0.2807096242904663 }, { "entropy": 9.509477615356445, "epoch": 0.06456397073363654, "mean_token_accuracy": 0.752043604850769, "num_tokens": 3599207.0, "step": 653, "train/ce_loss": 1.0708311796188354 }, { "epoch": 0.06456397073363654, "step": 653, "train/sim_loss": 0.10546875 }, { "epoch": 0.06456397073363654, "step": 653, "train/total_loss": 0.21255186200141907 }, { "entropy": 9.102299690246582, "epoch": 0.06466284358315207, "mean_token_accuracy": 0.7051962018013, "num_tokens": 3604867.0, "step": 654, "train/ce_loss": 1.24672269821167 }, { "epoch": 0.06466284358315207, "step": 654, "train/sim_loss": 0.171875 }, { "epoch": 0.06466284358315207, "step": 654, "train/total_loss": 0.2965472638607025 }, { "entropy": 9.670784950256348, "epoch": 0.0647617164326676, "mean_token_accuracy": 0.7203994393348694, "num_tokens": 3610192.0, "step": 655, "train/ce_loss": 0.871415913105011 }, { "epoch": 0.0647617164326676, "step": 655, "train/sim_loss": 0.0546875 }, { "epoch": 0.0647617164326676, "step": 655, "train/total_loss": 0.14182910323143005 }, { "entropy": 9.134929656982422, "epoch": 0.0648605892821831, "mean_token_accuracy": 0.7370558381080627, "num_tokens": 3615825.0, "step": 656, "train/ce_loss": 0.5548173785209656 }, { "epoch": 0.0648605892821831, "step": 656, "train/sim_loss": 0.0859375 }, { "epoch": 0.0648605892821831, "step": 656, "train/total_loss": 0.14141923189163208 }, { "entropy": 9.652360916137695, "epoch": 0.06495946213169863, "mean_token_accuracy": 0.7155050039291382, "num_tokens": 3621175.0, "step": 657, "train/ce_loss": 1.0125712156295776 }, { "epoch": 0.06495946213169863, "step": 657, "train/sim_loss": 0.1484375 }, { "epoch": 0.06495946213169863, "step": 657, "train/total_loss": 0.2496946156024933 }, { "entropy": 9.511025428771973, "epoch": 0.06505833498121416, "mean_token_accuracy": 0.7278989553451538, "num_tokens": 3626661.0, "step": 658, "train/ce_loss": 1.0681073665618896 }, { "epoch": 0.06505833498121416, "step": 658, "train/sim_loss": 0.0703125 }, { "epoch": 0.06505833498121416, "step": 658, "train/total_loss": 0.17712324857711792 }, { "entropy": 9.375175476074219, "epoch": 0.06515720783072969, "mean_token_accuracy": 0.725053071975708, "num_tokens": 3632210.0, "step": 659, "train/ce_loss": 0.5818920135498047 }, { "epoch": 0.06515720783072969, "step": 659, "train/sim_loss": 0.125 }, { "epoch": 0.06515720783072969, "step": 659, "train/total_loss": 0.18318919837474823 }, { "epoch": 0.0652560806802452, "grad_norm": 1.1287105083465576, "learning_rate": 9.83953913860456e-06, "loss": 0.1875, "step": 660 }, { "entropy": 9.311573028564453, "epoch": 0.0652560806802452, "mean_token_accuracy": 0.7591792941093445, "num_tokens": 3637778.0, "step": 660, "train/ce_loss": 0.5080028176307678 }, { "epoch": 0.0652560806802452, "step": 660, "train/sim_loss": 0.07421875 }, { "epoch": 0.0652560806802452, "step": 660, "train/total_loss": 0.12501902878284454 }, { "entropy": 9.416850090026855, "epoch": 0.06535495352976073, "mean_token_accuracy": 0.6426966190338135, "num_tokens": 3643253.0, "step": 661, "train/ce_loss": 1.726518154144287 }, { "epoch": 0.06535495352976073, "step": 661, "train/sim_loss": 0.14453125 }, { "epoch": 0.06535495352976073, "step": 661, "train/total_loss": 0.31718307733535767 }, { "entropy": 9.61324405670166, "epoch": 0.06545382637927626, "mean_token_accuracy": 0.743790864944458, "num_tokens": 3648640.0, "step": 662, "train/ce_loss": 0.7922841906547546 }, { "epoch": 0.06545382637927626, "step": 662, "train/sim_loss": 0.1015625 }, { "epoch": 0.06545382637927626, "step": 662, "train/total_loss": 0.18079093098640442 }, { "entropy": 9.627519607543945, "epoch": 0.06555269922879177, "mean_token_accuracy": 0.7205479741096497, "num_tokens": 3653993.0, "step": 663, "train/ce_loss": 1.2993699312210083 }, { "epoch": 0.06555269922879177, "step": 663, "train/sim_loss": 0.140625 }, { "epoch": 0.06555269922879177, "step": 663, "train/total_loss": 0.27056199312210083 }, { "entropy": 9.531505584716797, "epoch": 0.0656515720783073, "mean_token_accuracy": 0.7467300891876221, "num_tokens": 3659461.0, "step": 664, "train/ce_loss": 1.2779372930526733 }, { "epoch": 0.0656515720783073, "step": 664, "train/sim_loss": 0.1015625 }, { "epoch": 0.0656515720783073, "step": 664, "train/total_loss": 0.22935622930526733 }, { "entropy": 9.102359771728516, "epoch": 0.06575044492782282, "mean_token_accuracy": 0.7287449240684509, "num_tokens": 3665113.0, "step": 665, "train/ce_loss": 1.0363781452178955 }, { "epoch": 0.06575044492782282, "step": 665, "train/sim_loss": 0.1796875 }, { "epoch": 0.06575044492782282, "step": 665, "train/total_loss": 0.28332531452178955 }, { "entropy": 9.154924392700195, "epoch": 0.06584931777733834, "mean_token_accuracy": 0.7733089327812195, "num_tokens": 3670787.0, "step": 666, "train/ce_loss": 0.6397537589073181 }, { "epoch": 0.06584931777733834, "step": 666, "train/sim_loss": 0.11328125 }, { "epoch": 0.06584931777733834, "step": 666, "train/total_loss": 0.17725662887096405 }, { "entropy": 9.584300994873047, "epoch": 0.06594819062685386, "mean_token_accuracy": 0.7572815418243408, "num_tokens": 3676220.0, "step": 667, "train/ce_loss": 0.7688289880752563 }, { "epoch": 0.06594819062685386, "step": 667, "train/sim_loss": 0.0625 }, { "epoch": 0.06594819062685386, "step": 667, "train/total_loss": 0.13938289880752563 }, { "entropy": 9.596839904785156, "epoch": 0.06604706347636939, "mean_token_accuracy": 0.7130952477455139, "num_tokens": 3681645.0, "step": 668, "train/ce_loss": 1.1169962882995605 }, { "epoch": 0.06604706347636939, "step": 668, "train/sim_loss": 0.09375 }, { "epoch": 0.06604706347636939, "step": 668, "train/total_loss": 0.205449640750885 }, { "entropy": 9.335253715515137, "epoch": 0.06614593632588492, "mean_token_accuracy": 0.7351064085960388, "num_tokens": 3687067.0, "step": 669, "train/ce_loss": 0.5312085747718811 }, { "epoch": 0.06614593632588492, "step": 669, "train/sim_loss": 0.0390625 }, { "epoch": 0.06614593632588492, "step": 669, "train/total_loss": 0.09218335896730423 }, { "entropy": 9.352314949035645, "epoch": 0.06624480917540043, "mean_token_accuracy": 0.7630234956741333, "num_tokens": 3692676.0, "step": 670, "train/ce_loss": 0.9074511528015137 }, { "epoch": 0.06624480917540043, "step": 670, "train/sim_loss": 0.0703125 }, { "epoch": 0.06624480917540043, "step": 670, "train/total_loss": 0.16105762124061584 }, { "entropy": 9.275580406188965, "epoch": 0.06634368202491596, "mean_token_accuracy": 0.7044609785079956, "num_tokens": 3698287.0, "step": 671, "train/ce_loss": 0.6878821849822998 }, { "epoch": 0.06634368202491596, "step": 671, "train/sim_loss": 0.06640625 }, { "epoch": 0.06634368202491596, "step": 671, "train/total_loss": 0.13519448041915894 }, { "entropy": 9.360115051269531, "epoch": 0.06644255487443149, "mean_token_accuracy": 0.6899902820587158, "num_tokens": 3703913.0, "step": 672, "train/ce_loss": 0.7785911560058594 }, { "epoch": 0.06644255487443149, "step": 672, "train/sim_loss": 0.1640625 }, { "epoch": 0.06644255487443149, "step": 672, "train/total_loss": 0.24192161858081818 }, { "entropy": 9.135751724243164, "epoch": 0.066541427723947, "mean_token_accuracy": 0.7763023376464844, "num_tokens": 3709498.0, "step": 673, "train/ce_loss": 0.6357479095458984 }, { "epoch": 0.066541427723947, "step": 673, "train/sim_loss": 0.0546875 }, { "epoch": 0.066541427723947, "step": 673, "train/total_loss": 0.11826229095458984 }, { "entropy": 9.4376802444458, "epoch": 0.06664030057346253, "mean_token_accuracy": 0.7041564583778381, "num_tokens": 3714905.0, "step": 674, "train/ce_loss": 1.1817408800125122 }, { "epoch": 0.06664030057346253, "step": 674, "train/sim_loss": 0.09375 }, { "epoch": 0.06664030057346253, "step": 674, "train/total_loss": 0.21192409098148346 }, { "entropy": 9.502941131591797, "epoch": 0.06673917342297805, "mean_token_accuracy": 0.6962025165557861, "num_tokens": 3720438.0, "step": 675, "train/ce_loss": 0.518011748790741 }, { "epoch": 0.06673917342297805, "step": 675, "train/sim_loss": 0.109375 }, { "epoch": 0.06673917342297805, "step": 675, "train/total_loss": 0.1611761748790741 }, { "entropy": 9.702911376953125, "epoch": 0.06683804627249357, "mean_token_accuracy": 0.7694370150566101, "num_tokens": 3725824.0, "step": 676, "train/ce_loss": 0.7949889898300171 }, { "epoch": 0.06683804627249357, "step": 676, "train/sim_loss": 0.109375 }, { "epoch": 0.06683804627249357, "step": 676, "train/total_loss": 0.18887390196323395 }, { "entropy": 9.342824935913086, "epoch": 0.0669369191220091, "mean_token_accuracy": 0.7887324094772339, "num_tokens": 3731303.0, "step": 677, "train/ce_loss": 0.6587103009223938 }, { "epoch": 0.0669369191220091, "step": 677, "train/sim_loss": 0.0390625 }, { "epoch": 0.0669369191220091, "step": 677, "train/total_loss": 0.10493353009223938 }, { "entropy": 9.296541213989258, "epoch": 0.06703579197152462, "mean_token_accuracy": 0.7418181896209717, "num_tokens": 3736759.0, "step": 678, "train/ce_loss": 0.5438504219055176 }, { "epoch": 0.06703579197152462, "step": 678, "train/sim_loss": 0.0546875 }, { "epoch": 0.06703579197152462, "step": 678, "train/total_loss": 0.10907254368066788 }, { "entropy": 9.778236389160156, "epoch": 0.06713466482104015, "mean_token_accuracy": 0.7172414064407349, "num_tokens": 3742017.0, "step": 679, "train/ce_loss": 0.9712975025177002 }, { "epoch": 0.06713466482104015, "step": 679, "train/sim_loss": 0.09375 }, { "epoch": 0.06713466482104015, "step": 679, "train/total_loss": 0.19087976217269897 }, { "epoch": 0.06723353767055566, "grad_norm": 1.0204699039459229, "learning_rate": 9.834594273846612e-06, "loss": 0.1811, "step": 680 }, { "entropy": 9.486417770385742, "epoch": 0.06723353767055566, "mean_token_accuracy": 0.7508611083030701, "num_tokens": 3747492.0, "step": 680, "train/ce_loss": 0.8175002336502075 }, { "epoch": 0.06723353767055566, "step": 680, "train/sim_loss": 0.109375 }, { "epoch": 0.06723353767055566, "step": 680, "train/total_loss": 0.1911250352859497 }, { "entropy": 9.231573104858398, "epoch": 0.06733241052007119, "mean_token_accuracy": 0.7098121047019958, "num_tokens": 3753090.0, "step": 681, "train/ce_loss": 0.47511371970176697 }, { "epoch": 0.06733241052007119, "step": 681, "train/sim_loss": 0.10546875 }, { "epoch": 0.06733241052007119, "step": 681, "train/total_loss": 0.15298011898994446 }, { "entropy": 9.65233325958252, "epoch": 0.06743128336958672, "mean_token_accuracy": 0.6980891823768616, "num_tokens": 3758473.0, "step": 682, "train/ce_loss": 1.5086872577667236 }, { "epoch": 0.06743128336958672, "step": 682, "train/sim_loss": 0.12109375 }, { "epoch": 0.06743128336958672, "step": 682, "train/total_loss": 0.2719624638557434 }, { "entropy": 8.684928894042969, "epoch": 0.06753015621910223, "mean_token_accuracy": 0.6872074604034424, "num_tokens": 3764348.0, "step": 683, "train/ce_loss": 0.3920105993747711 }, { "epoch": 0.06753015621910223, "step": 683, "train/sim_loss": 0.125 }, { "epoch": 0.06753015621910223, "step": 683, "train/total_loss": 0.1642010658979416 }, { "entropy": 9.272201538085938, "epoch": 0.06762902906861776, "mean_token_accuracy": 0.7466802597045898, "num_tokens": 3769911.0, "step": 684, "train/ce_loss": 0.7470539212226868 }, { "epoch": 0.06762902906861776, "step": 684, "train/sim_loss": 0.09375 }, { "epoch": 0.06762902906861776, "step": 684, "train/total_loss": 0.16845539212226868 }, { "entropy": 9.381033897399902, "epoch": 0.06772790191813328, "mean_token_accuracy": 0.7472661137580872, "num_tokens": 3775328.0, "step": 685, "train/ce_loss": 0.846916913986206 }, { "epoch": 0.06772790191813328, "step": 685, "train/sim_loss": 0.078125 }, { "epoch": 0.06772790191813328, "step": 685, "train/total_loss": 0.16281670331954956 }, { "entropy": 9.447794914245605, "epoch": 0.0678267747676488, "mean_token_accuracy": 0.7219069004058838, "num_tokens": 3780804.0, "step": 686, "train/ce_loss": 0.6851433515548706 }, { "epoch": 0.0678267747676488, "step": 686, "train/sim_loss": 0.0546875 }, { "epoch": 0.0678267747676488, "step": 686, "train/total_loss": 0.12320183962583542 }, { "entropy": 9.542617797851562, "epoch": 0.06792564761716433, "mean_token_accuracy": 0.7446808218955994, "num_tokens": 3786403.0, "step": 687, "train/ce_loss": 0.9808657169342041 }, { "epoch": 0.06792564761716433, "step": 687, "train/sim_loss": 0.1171875 }, { "epoch": 0.06792564761716433, "step": 687, "train/total_loss": 0.21527406573295593 }, { "entropy": 9.524702072143555, "epoch": 0.06802452046667985, "mean_token_accuracy": 0.7306666374206543, "num_tokens": 3791742.0, "step": 688, "train/ce_loss": 1.390016794204712 }, { "epoch": 0.06802452046667985, "step": 688, "train/sim_loss": 0.1953125 }, { "epoch": 0.06802452046667985, "step": 688, "train/total_loss": 0.33431416749954224 }, { "entropy": 9.163127899169922, "epoch": 0.06812339331619537, "mean_token_accuracy": 0.780061662197113, "num_tokens": 3797394.0, "step": 689, "train/ce_loss": 0.9482147693634033 }, { "epoch": 0.06812339331619537, "step": 689, "train/sim_loss": 0.1171875 }, { "epoch": 0.06812339331619537, "step": 689, "train/total_loss": 0.2120089828968048 }, { "entropy": 9.355783462524414, "epoch": 0.0682222661657109, "mean_token_accuracy": 0.7057115435600281, "num_tokens": 3802887.0, "step": 690, "train/ce_loss": 0.4403047561645508 }, { "epoch": 0.0682222661657109, "step": 690, "train/sim_loss": 0.0625 }, { "epoch": 0.0682222661657109, "step": 690, "train/total_loss": 0.10653047263622284 }, { "entropy": 9.492921829223633, "epoch": 0.06832113901522642, "mean_token_accuracy": 0.7273781895637512, "num_tokens": 3808404.0, "step": 691, "train/ce_loss": 1.2281547784805298 }, { "epoch": 0.06832113901522642, "step": 691, "train/sim_loss": 0.125 }, { "epoch": 0.06832113901522642, "step": 691, "train/total_loss": 0.24781548976898193 }, { "entropy": 9.170289993286133, "epoch": 0.06842001186474195, "mean_token_accuracy": 0.7075743079185486, "num_tokens": 3814008.0, "step": 692, "train/ce_loss": 0.6834650039672852 }, { "epoch": 0.06842001186474195, "step": 692, "train/sim_loss": 0.0625 }, { "epoch": 0.06842001186474195, "step": 692, "train/total_loss": 0.13084650039672852 }, { "entropy": 9.327987670898438, "epoch": 0.06851888471425746, "mean_token_accuracy": 0.7552836537361145, "num_tokens": 3819481.0, "step": 693, "train/ce_loss": 0.9806194305419922 }, { "epoch": 0.06851888471425746, "step": 693, "train/sim_loss": 0.14453125 }, { "epoch": 0.06851888471425746, "step": 693, "train/total_loss": 0.2425931990146637 }, { "entropy": 9.495173454284668, "epoch": 0.06861775756377299, "mean_token_accuracy": 0.7209567427635193, "num_tokens": 3824991.0, "step": 694, "train/ce_loss": 1.103578805923462 }, { "epoch": 0.06861775756377299, "step": 694, "train/sim_loss": 0.1328125 }, { "epoch": 0.06861775756377299, "step": 694, "train/total_loss": 0.2431703805923462 }, { "entropy": 9.77934455871582, "epoch": 0.06871663041328852, "mean_token_accuracy": 0.7440890073776245, "num_tokens": 3830220.0, "step": 695, "train/ce_loss": 0.6771624088287354 }, { "epoch": 0.06871663041328852, "step": 695, "train/sim_loss": 0.0390625 }, { "epoch": 0.06871663041328852, "step": 695, "train/total_loss": 0.10677874088287354 }, { "entropy": 9.291604995727539, "epoch": 0.06881550326280403, "mean_token_accuracy": 0.6825902462005615, "num_tokens": 3835773.0, "step": 696, "train/ce_loss": 0.7181695103645325 }, { "epoch": 0.06881550326280403, "step": 696, "train/sim_loss": 0.08203125 }, { "epoch": 0.06881550326280403, "step": 696, "train/total_loss": 0.15384820103645325 }, { "entropy": 9.283123016357422, "epoch": 0.06891437611231956, "mean_token_accuracy": 0.7454349994659424, "num_tokens": 3841311.0, "step": 697, "train/ce_loss": 0.3572738468647003 }, { "epoch": 0.06891437611231956, "step": 697, "train/sim_loss": 0.09765625 }, { "epoch": 0.06891437611231956, "step": 697, "train/total_loss": 0.1333836317062378 }, { "entropy": 9.437932968139648, "epoch": 0.06901324896183508, "mean_token_accuracy": 0.6726618409156799, "num_tokens": 3846753.0, "step": 698, "train/ce_loss": 1.6572483777999878 }, { "epoch": 0.06901324896183508, "step": 698, "train/sim_loss": 0.11328125 }, { "epoch": 0.06901324896183508, "step": 698, "train/total_loss": 0.27900609374046326 }, { "entropy": 9.614707946777344, "epoch": 0.0691121218113506, "mean_token_accuracy": 0.7485795617103577, "num_tokens": 3852082.0, "step": 699, "train/ce_loss": 0.5013539791107178 }, { "epoch": 0.0691121218113506, "step": 699, "train/sim_loss": 0.07421875 }, { "epoch": 0.0691121218113506, "step": 699, "train/total_loss": 0.12435415387153625 }, { "epoch": 0.06921099466086612, "grad_norm": 1.4260398149490356, "learning_rate": 9.829649409088662e-06, "loss": 0.1916, "step": 700 }, { "entropy": 9.275681495666504, "epoch": 0.06921099466086612, "mean_token_accuracy": 0.7400611639022827, "num_tokens": 3857648.0, "step": 700, "train/ce_loss": 0.8091702461242676 }, { "epoch": 0.06921099466086612, "step": 700, "train/sim_loss": 0.09765625 }, { "epoch": 0.06921099466086612, "step": 700, "train/total_loss": 0.17857328057289124 }, { "entropy": 9.429615020751953, "epoch": 0.06930986751038165, "mean_token_accuracy": 0.6991676688194275, "num_tokens": 3863122.0, "step": 701, "train/ce_loss": 0.9984180331230164 }, { "epoch": 0.06930986751038165, "step": 701, "train/sim_loss": 0.140625 }, { "epoch": 0.06930986751038165, "step": 701, "train/total_loss": 0.24046680331230164 }, { "entropy": 9.091985702514648, "epoch": 0.06940874035989718, "mean_token_accuracy": 0.7417530417442322, "num_tokens": 3868838.0, "step": 702, "train/ce_loss": 0.46978938579559326 }, { "epoch": 0.06940874035989718, "step": 702, "train/sim_loss": 0.05859375 }, { "epoch": 0.06940874035989718, "step": 702, "train/total_loss": 0.10557268559932709 }, { "entropy": 9.387910842895508, "epoch": 0.06950761320941269, "mean_token_accuracy": 0.7781493663787842, "num_tokens": 3874349.0, "step": 703, "train/ce_loss": 0.5985023975372314 }, { "epoch": 0.06950761320941269, "step": 703, "train/sim_loss": 0.046875 }, { "epoch": 0.06950761320941269, "step": 703, "train/total_loss": 0.10672524571418762 }, { "entropy": 9.718038558959961, "epoch": 0.06960648605892822, "mean_token_accuracy": 0.6816860437393188, "num_tokens": 3879617.0, "step": 704, "train/ce_loss": 0.7745460867881775 }, { "epoch": 0.06960648605892822, "step": 704, "train/sim_loss": 0.0859375 }, { "epoch": 0.06960648605892822, "step": 704, "train/total_loss": 0.16339211165905 }, { "entropy": 9.124296188354492, "epoch": 0.06970535890844375, "mean_token_accuracy": 0.7299339175224304, "num_tokens": 3885204.0, "step": 705, "train/ce_loss": 1.6264070272445679 }, { "epoch": 0.06970535890844375, "step": 705, "train/sim_loss": 0.0859375 }, { "epoch": 0.06970535890844375, "step": 705, "train/total_loss": 0.24857820570468903 }, { "entropy": 9.117471694946289, "epoch": 0.06980423175795926, "mean_token_accuracy": 0.6921707987785339, "num_tokens": 3890891.0, "step": 706, "train/ce_loss": 0.83110511302948 }, { "epoch": 0.06980423175795926, "step": 706, "train/sim_loss": 0.1328125 }, { "epoch": 0.06980423175795926, "step": 706, "train/total_loss": 0.215923011302948 }, { "entropy": 9.223295211791992, "epoch": 0.06990310460747479, "mean_token_accuracy": 0.7830092310905457, "num_tokens": 3896504.0, "step": 707, "train/ce_loss": 0.48391973972320557 }, { "epoch": 0.06990310460747479, "step": 707, "train/sim_loss": 0.04296875 }, { "epoch": 0.06990310460747479, "step": 707, "train/total_loss": 0.09136072546243668 }, { "entropy": 8.992353439331055, "epoch": 0.07000197745699031, "mean_token_accuracy": 0.752212405204773, "num_tokens": 3902198.0, "step": 708, "train/ce_loss": 0.8430083990097046 }, { "epoch": 0.07000197745699031, "step": 708, "train/sim_loss": 0.03515625 }, { "epoch": 0.07000197745699031, "step": 708, "train/total_loss": 0.11945708841085434 }, { "entropy": 9.71152400970459, "epoch": 0.07010085030650583, "mean_token_accuracy": 0.7196969985961914, "num_tokens": 3907682.0, "step": 709, "train/ce_loss": 0.45836755633354187 }, { "epoch": 0.07010085030650583, "step": 709, "train/sim_loss": 0.18359375 }, { "epoch": 0.07010085030650583, "step": 709, "train/total_loss": 0.22943051159381866 }, { "entropy": 9.51969051361084, "epoch": 0.07019972315602135, "mean_token_accuracy": 0.714634120464325, "num_tokens": 3913126.0, "step": 710, "train/ce_loss": 0.49202266335487366 }, { "epoch": 0.07019972315602135, "step": 710, "train/sim_loss": 0.09375 }, { "epoch": 0.07019972315602135, "step": 710, "train/total_loss": 0.14295226335525513 }, { "entropy": 9.568197250366211, "epoch": 0.07029859600553688, "mean_token_accuracy": 0.7410160899162292, "num_tokens": 3918517.0, "step": 711, "train/ce_loss": 0.5910638570785522 }, { "epoch": 0.07029859600553688, "step": 711, "train/sim_loss": 0.0859375 }, { "epoch": 0.07029859600553688, "step": 711, "train/total_loss": 0.14504387974739075 }, { "entropy": 9.537040710449219, "epoch": 0.07039746885505241, "mean_token_accuracy": 0.7282878160476685, "num_tokens": 3923918.0, "step": 712, "train/ce_loss": 0.9542323350906372 }, { "epoch": 0.07039746885505241, "step": 712, "train/sim_loss": 0.140625 }, { "epoch": 0.07039746885505241, "step": 712, "train/total_loss": 0.23604823648929596 }, { "entropy": 9.379705429077148, "epoch": 0.07049634170456792, "mean_token_accuracy": 0.7024503946304321, "num_tokens": 3929406.0, "step": 713, "train/ce_loss": 0.3697109818458557 }, { "epoch": 0.07049634170456792, "step": 713, "train/sim_loss": 0.08984375 }, { "epoch": 0.07049634170456792, "step": 713, "train/total_loss": 0.1268148422241211 }, { "entropy": 9.2758150100708, "epoch": 0.07059521455408345, "mean_token_accuracy": 0.7993197441101074, "num_tokens": 3934860.0, "step": 714, "train/ce_loss": 0.5814694762229919 }, { "epoch": 0.07059521455408345, "step": 714, "train/sim_loss": 0.078125 }, { "epoch": 0.07059521455408345, "step": 714, "train/total_loss": 0.13627195358276367 }, { "entropy": 9.488122940063477, "epoch": 0.07069408740359898, "mean_token_accuracy": 0.7338129281997681, "num_tokens": 3940271.0, "step": 715, "train/ce_loss": 0.767157256603241 }, { "epoch": 0.07069408740359898, "step": 715, "train/sim_loss": 0.0546875 }, { "epoch": 0.07069408740359898, "step": 715, "train/total_loss": 0.13140323758125305 }, { "entropy": 9.209687232971191, "epoch": 0.07079296025311449, "mean_token_accuracy": 0.7332636117935181, "num_tokens": 3945850.0, "step": 716, "train/ce_loss": 0.8752963542938232 }, { "epoch": 0.07079296025311449, "step": 716, "train/sim_loss": 0.11328125 }, { "epoch": 0.07079296025311449, "step": 716, "train/total_loss": 0.20081087946891785 }, { "entropy": 9.366090774536133, "epoch": 0.07089183310263002, "mean_token_accuracy": 0.7263948321342468, "num_tokens": 3951382.0, "step": 717, "train/ce_loss": 0.6038789749145508 }, { "epoch": 0.07089183310263002, "step": 717, "train/sim_loss": 0.09375 }, { "epoch": 0.07089183310263002, "step": 717, "train/total_loss": 0.15413789451122284 }, { "entropy": 9.418317794799805, "epoch": 0.07099070595214554, "mean_token_accuracy": 0.7037484645843506, "num_tokens": 3956794.0, "step": 718, "train/ce_loss": 0.7623369097709656 }, { "epoch": 0.07099070595214554, "step": 718, "train/sim_loss": 0.08984375 }, { "epoch": 0.07099070595214554, "step": 718, "train/total_loss": 0.16607743501663208 }, { "entropy": 9.404210090637207, "epoch": 0.07108957880166106, "mean_token_accuracy": 0.74387526512146, "num_tokens": 3962342.0, "step": 719, "train/ce_loss": 0.8346834778785706 }, { "epoch": 0.07108957880166106, "step": 719, "train/sim_loss": 0.11328125 }, { "epoch": 0.07108957880166106, "step": 719, "train/total_loss": 0.19674959778785706 }, { "epoch": 0.07118845165117658, "grad_norm": 1.2558749914169312, "learning_rate": 9.824704544330713e-06, "loss": 0.1898, "step": 720 }, { "entropy": 9.238387107849121, "epoch": 0.07118845165117658, "mean_token_accuracy": 0.7751045823097229, "num_tokens": 3967966.0, "step": 720, "train/ce_loss": 0.8425070643424988 }, { "epoch": 0.07118845165117658, "step": 720, "train/sim_loss": 0.15234375 }, { "epoch": 0.07118845165117658, "step": 720, "train/total_loss": 0.23659446835517883 }, { "entropy": 9.619183540344238, "epoch": 0.07128732450069211, "mean_token_accuracy": 0.727642297744751, "num_tokens": 3973304.0, "step": 721, "train/ce_loss": 1.2830555438995361 }, { "epoch": 0.07128732450069211, "step": 721, "train/sim_loss": 0.13671875 }, { "epoch": 0.07128732450069211, "step": 721, "train/total_loss": 0.2650243043899536 }, { "entropy": 9.652593612670898, "epoch": 0.07138619735020764, "mean_token_accuracy": 0.7782546281814575, "num_tokens": 3978671.0, "step": 722, "train/ce_loss": 0.8905591368675232 }, { "epoch": 0.07138619735020764, "step": 722, "train/sim_loss": 0.05078125 }, { "epoch": 0.07138619735020764, "step": 722, "train/total_loss": 0.13983717560768127 }, { "entropy": 9.451098442077637, "epoch": 0.07148507019972315, "mean_token_accuracy": 0.7461024522781372, "num_tokens": 3984173.0, "step": 723, "train/ce_loss": 0.7693148851394653 }, { "epoch": 0.07148507019972315, "step": 723, "train/sim_loss": 0.1015625 }, { "epoch": 0.07148507019972315, "step": 723, "train/total_loss": 0.17849399149417877 }, { "entropy": 9.739234924316406, "epoch": 0.07158394304923868, "mean_token_accuracy": 0.7486772537231445, "num_tokens": 3989493.0, "step": 724, "train/ce_loss": 0.9678433537483215 }, { "epoch": 0.07158394304923868, "step": 724, "train/sim_loss": 0.06640625 }, { "epoch": 0.07158394304923868, "step": 724, "train/total_loss": 0.1631905883550644 }, { "entropy": 9.534040451049805, "epoch": 0.0716828158987542, "mean_token_accuracy": 0.7540172934532166, "num_tokens": 3994965.0, "step": 725, "train/ce_loss": 0.6165555119514465 }, { "epoch": 0.0716828158987542, "step": 725, "train/sim_loss": 0.09765625 }, { "epoch": 0.0716828158987542, "step": 725, "train/total_loss": 0.15931180119514465 }, { "entropy": 9.506189346313477, "epoch": 0.07178168874826972, "mean_token_accuracy": 0.7596153616905212, "num_tokens": 4000346.0, "step": 726, "train/ce_loss": 0.9203923940658569 }, { "epoch": 0.07178168874826972, "step": 726, "train/sim_loss": 0.11328125 }, { "epoch": 0.07178168874826972, "step": 726, "train/total_loss": 0.20532049238681793 }, { "entropy": 9.231067657470703, "epoch": 0.07188056159778525, "mean_token_accuracy": 0.7422885298728943, "num_tokens": 4005860.0, "step": 727, "train/ce_loss": 0.4060293138027191 }, { "epoch": 0.07188056159778525, "step": 727, "train/sim_loss": 0.0546875 }, { "epoch": 0.07188056159778525, "step": 727, "train/total_loss": 0.09529043734073639 }, { "entropy": 9.291755676269531, "epoch": 0.07197943444730077, "mean_token_accuracy": 0.7961053848266602, "num_tokens": 4011414.0, "step": 728, "train/ce_loss": 0.44549763202667236 }, { "epoch": 0.07197943444730077, "step": 728, "train/sim_loss": 0.0859375 }, { "epoch": 0.07197943444730077, "step": 728, "train/total_loss": 0.13048726320266724 }, { "entropy": 9.513713836669922, "epoch": 0.07207830729681629, "mean_token_accuracy": 0.7631579041481018, "num_tokens": 4016841.0, "step": 729, "train/ce_loss": 0.5615242123603821 }, { "epoch": 0.07207830729681629, "step": 729, "train/sim_loss": 0.05859375 }, { "epoch": 0.07207830729681629, "step": 729, "train/total_loss": 0.11474616825580597 }, { "entropy": 9.216581344604492, "epoch": 0.07217718014633182, "mean_token_accuracy": 0.7216828465461731, "num_tokens": 4022468.0, "step": 730, "train/ce_loss": 0.6993263959884644 }, { "epoch": 0.07217718014633182, "step": 730, "train/sim_loss": 0.078125 }, { "epoch": 0.07217718014633182, "step": 730, "train/total_loss": 0.14805763959884644 }, { "entropy": 8.961934089660645, "epoch": 0.07227605299584734, "mean_token_accuracy": 0.7108225226402283, "num_tokens": 4028253.0, "step": 731, "train/ce_loss": 0.8354874849319458 }, { "epoch": 0.07227605299584734, "step": 731, "train/sim_loss": 0.0859375 }, { "epoch": 0.07227605299584734, "step": 731, "train/total_loss": 0.16948625445365906 }, { "entropy": 9.435766220092773, "epoch": 0.07237492584536287, "mean_token_accuracy": 0.7233333587646484, "num_tokens": 4033705.0, "step": 732, "train/ce_loss": 1.094090223312378 }, { "epoch": 0.07237492584536287, "step": 732, "train/sim_loss": 0.07421875 }, { "epoch": 0.07237492584536287, "step": 732, "train/total_loss": 0.18362778425216675 }, { "entropy": 9.601432800292969, "epoch": 0.07247379869487838, "mean_token_accuracy": 0.7481012940406799, "num_tokens": 4039101.0, "step": 733, "train/ce_loss": 0.7236177325248718 }, { "epoch": 0.07247379869487838, "step": 733, "train/sim_loss": 0.08203125 }, { "epoch": 0.07247379869487838, "step": 733, "train/total_loss": 0.1543930172920227 }, { "entropy": 9.503561973571777, "epoch": 0.07257267154439391, "mean_token_accuracy": 0.7182940244674683, "num_tokens": 4044598.0, "step": 734, "train/ce_loss": 0.8839106559753418 }, { "epoch": 0.07257267154439391, "step": 734, "train/sim_loss": 0.14453125 }, { "epoch": 0.07257267154439391, "step": 734, "train/total_loss": 0.23292231559753418 }, { "entropy": 9.686687469482422, "epoch": 0.07267154439390944, "mean_token_accuracy": 0.805232584476471, "num_tokens": 4049959.0, "step": 735, "train/ce_loss": 0.7191175222396851 }, { "epoch": 0.07267154439390944, "step": 735, "train/sim_loss": 0.06640625 }, { "epoch": 0.07267154439390944, "step": 735, "train/total_loss": 0.1383180022239685 }, { "entropy": 9.5287446975708, "epoch": 0.07277041724342495, "mean_token_accuracy": 0.7430379986763, "num_tokens": 4055408.0, "step": 736, "train/ce_loss": 0.923762321472168 }, { "epoch": 0.07277041724342495, "step": 736, "train/sim_loss": 0.12109375 }, { "epoch": 0.07277041724342495, "step": 736, "train/total_loss": 0.2134699821472168 }, { "entropy": 9.162788391113281, "epoch": 0.07286929009294048, "mean_token_accuracy": 0.7289048433303833, "num_tokens": 4060981.0, "step": 737, "train/ce_loss": 0.5855153203010559 }, { "epoch": 0.07286929009294048, "step": 737, "train/sim_loss": 0.0546875 }, { "epoch": 0.07286929009294048, "step": 737, "train/total_loss": 0.11323903501033783 }, { "entropy": 9.36433219909668, "epoch": 0.072968162942456, "mean_token_accuracy": 0.7842227220535278, "num_tokens": 4066502.0, "step": 738, "train/ce_loss": 0.5373504757881165 }, { "epoch": 0.072968162942456, "step": 738, "train/sim_loss": 0.04296875 }, { "epoch": 0.072968162942456, "step": 738, "train/total_loss": 0.09670379757881165 }, { "entropy": 9.457647323608398, "epoch": 0.07306703579197152, "mean_token_accuracy": 0.6925638318061829, "num_tokens": 4071935.0, "step": 739, "train/ce_loss": 0.8400789499282837 }, { "epoch": 0.07306703579197152, "step": 739, "train/sim_loss": 0.09375 }, { "epoch": 0.07306703579197152, "step": 739, "train/total_loss": 0.1777578890323639 }, { "epoch": 0.07316590864148705, "grad_norm": 1.329858660697937, "learning_rate": 9.819759679572765e-06, "loss": 0.1795, "step": 740 }, { "entropy": 9.37187385559082, "epoch": 0.07316590864148705, "mean_token_accuracy": 0.710671067237854, "num_tokens": 4077491.0, "step": 740, "train/ce_loss": 0.8023810386657715 }, { "epoch": 0.07316590864148705, "step": 740, "train/sim_loss": 0.08203125 }, { "epoch": 0.07316590864148705, "step": 740, "train/total_loss": 0.16226935386657715 }, { "entropy": 9.382226943969727, "epoch": 0.07326478149100257, "mean_token_accuracy": 0.7409988641738892, "num_tokens": 4082981.0, "step": 741, "train/ce_loss": 0.5812582969665527 }, { "epoch": 0.07326478149100257, "step": 741, "train/sim_loss": 0.08203125 }, { "epoch": 0.07326478149100257, "step": 741, "train/total_loss": 0.1401570737361908 }, { "entropy": 9.37686824798584, "epoch": 0.0733636543405181, "mean_token_accuracy": 0.7863534688949585, "num_tokens": 4088657.0, "step": 742, "train/ce_loss": 0.6852748990058899 }, { "epoch": 0.0733636543405181, "step": 742, "train/sim_loss": 0.06640625 }, { "epoch": 0.0733636543405181, "step": 742, "train/total_loss": 0.134933739900589 }, { "entropy": 9.333158493041992, "epoch": 0.07346252719003361, "mean_token_accuracy": 0.7448359727859497, "num_tokens": 4094059.0, "step": 743, "train/ce_loss": 0.4334511458873749 }, { "epoch": 0.07346252719003361, "step": 743, "train/sim_loss": 0.08203125 }, { "epoch": 0.07346252719003361, "step": 743, "train/total_loss": 0.1253763735294342 }, { "entropy": 9.017110824584961, "epoch": 0.07356140003954914, "mean_token_accuracy": 0.7955056428909302, "num_tokens": 4099575.0, "step": 744, "train/ce_loss": 0.7083708643913269 }, { "epoch": 0.07356140003954914, "step": 744, "train/sim_loss": 0.06640625 }, { "epoch": 0.07356140003954914, "step": 744, "train/total_loss": 0.1372433304786682 }, { "entropy": 9.30793571472168, "epoch": 0.07366027288906467, "mean_token_accuracy": 0.7178871631622314, "num_tokens": 4105048.0, "step": 745, "train/ce_loss": 0.6316896677017212 }, { "epoch": 0.07366027288906467, "step": 745, "train/sim_loss": 0.13671875 }, { "epoch": 0.07366027288906467, "step": 745, "train/total_loss": 0.1998877227306366 }, { "entropy": 9.282946586608887, "epoch": 0.07375914573858018, "mean_token_accuracy": 0.6824196577072144, "num_tokens": 4110719.0, "step": 746, "train/ce_loss": 1.4959665536880493 }, { "epoch": 0.07375914573858018, "step": 746, "train/sim_loss": 0.078125 }, { "epoch": 0.07375914573858018, "step": 746, "train/total_loss": 0.2277216613292694 }, { "entropy": 9.433370590209961, "epoch": 0.07385801858809571, "mean_token_accuracy": 0.7644171714782715, "num_tokens": 4116119.0, "step": 747, "train/ce_loss": 0.9679957032203674 }, { "epoch": 0.07385801858809571, "step": 747, "train/sim_loss": 0.1015625 }, { "epoch": 0.07385801858809571, "step": 747, "train/total_loss": 0.1983620822429657 }, { "entropy": 9.198036193847656, "epoch": 0.07395689143761124, "mean_token_accuracy": 0.7851851582527161, "num_tokens": 4121673.0, "step": 748, "train/ce_loss": 0.739772617816925 }, { "epoch": 0.07395689143761124, "step": 748, "train/sim_loss": 0.09765625 }, { "epoch": 0.07395689143761124, "step": 748, "train/total_loss": 0.1716335117816925 }, { "entropy": 9.615966796875, "epoch": 0.07405576428712675, "mean_token_accuracy": 0.7064676880836487, "num_tokens": 4127215.0, "step": 749, "train/ce_loss": 1.0176602602005005 }, { "epoch": 0.07405576428712675, "step": 749, "train/sim_loss": 0.140625 }, { "epoch": 0.07405576428712675, "step": 749, "train/total_loss": 0.24239102005958557 }, { "entropy": 9.371624946594238, "epoch": 0.07415463713664228, "mean_token_accuracy": 0.7633689641952515, "num_tokens": 4132483.0, "step": 750, "train/ce_loss": 0.9306820034980774 }, { "epoch": 0.07415463713664228, "step": 750, "train/sim_loss": 0.1171875 }, { "epoch": 0.07415463713664228, "step": 750, "train/total_loss": 0.2102557122707367 }, { "entropy": 9.119059562683105, "epoch": 0.0742535099861578, "mean_token_accuracy": 0.7537993788719177, "num_tokens": 4138130.0, "step": 751, "train/ce_loss": 0.4563278257846832 }, { "epoch": 0.0742535099861578, "step": 751, "train/sim_loss": 0.09765625 }, { "epoch": 0.0742535099861578, "step": 751, "train/total_loss": 0.14328902959823608 }, { "entropy": 9.568120956420898, "epoch": 0.07435238283567333, "mean_token_accuracy": 0.7020202279090881, "num_tokens": 4143585.0, "step": 752, "train/ce_loss": 1.2962545156478882 }, { "epoch": 0.07435238283567333, "step": 752, "train/sim_loss": 0.12109375 }, { "epoch": 0.07435238283567333, "step": 752, "train/total_loss": 0.25071918964385986 }, { "entropy": 9.344210624694824, "epoch": 0.07445125568518884, "mean_token_accuracy": 0.797802209854126, "num_tokens": 4149129.0, "step": 753, "train/ce_loss": 0.8565330505371094 }, { "epoch": 0.07445125568518884, "step": 753, "train/sim_loss": 0.15625 }, { "epoch": 0.07445125568518884, "step": 753, "train/total_loss": 0.24190330505371094 }, { "entropy": 9.528658866882324, "epoch": 0.07455012853470437, "mean_token_accuracy": 0.7291428446769714, "num_tokens": 4154818.0, "step": 754, "train/ce_loss": 0.9806751012802124 }, { "epoch": 0.07455012853470437, "step": 754, "train/sim_loss": 0.1796875 }, { "epoch": 0.07455012853470437, "step": 754, "train/total_loss": 0.2777550220489502 }, { "entropy": 9.714971542358398, "epoch": 0.0746490013842199, "mean_token_accuracy": 0.791208803653717, "num_tokens": 4160124.0, "step": 755, "train/ce_loss": 0.5049983859062195 }, { "epoch": 0.0746490013842199, "step": 755, "train/sim_loss": 0.0546875 }, { "epoch": 0.0746490013842199, "step": 755, "train/total_loss": 0.10518734157085419 }, { "entropy": 9.546977996826172, "epoch": 0.07474787423373541, "mean_token_accuracy": 0.786649227142334, "num_tokens": 4165414.0, "step": 756, "train/ce_loss": 0.9412271976470947 }, { "epoch": 0.07474787423373541, "step": 756, "train/sim_loss": 0.10546875 }, { "epoch": 0.07474787423373541, "step": 756, "train/total_loss": 0.1995914727449417 }, { "entropy": 9.692506790161133, "epoch": 0.07484674708325094, "mean_token_accuracy": 0.7566137313842773, "num_tokens": 4170756.0, "step": 757, "train/ce_loss": 0.7826180458068848 }, { "epoch": 0.07484674708325094, "step": 757, "train/sim_loss": 0.1015625 }, { "epoch": 0.07484674708325094, "step": 757, "train/total_loss": 0.17982430756092072 }, { "entropy": 9.456832885742188, "epoch": 0.07494561993276647, "mean_token_accuracy": 0.7172897458076477, "num_tokens": 4176197.0, "step": 758, "train/ce_loss": 1.657883644104004 }, { "epoch": 0.07494561993276647, "step": 758, "train/sim_loss": 0.16015625 }, { "epoch": 0.07494561993276647, "step": 758, "train/total_loss": 0.32594460248947144 }, { "entropy": 9.358154296875, "epoch": 0.07504449278228198, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 4181727.0, "step": 759, "train/ce_loss": 1.2755345106124878 }, { "epoch": 0.07504449278228198, "step": 759, "train/sim_loss": 0.09375 }, { "epoch": 0.07504449278228198, "step": 759, "train/total_loss": 0.22130344808101654 }, { "epoch": 0.0751433656317975, "grad_norm": 1.0468997955322266, "learning_rate": 9.814814814814815e-06, "loss": 0.1795, "step": 760 }, { "entropy": 9.474666595458984, "epoch": 0.0751433656317975, "mean_token_accuracy": 0.7241379022598267, "num_tokens": 4187142.0, "step": 760, "train/ce_loss": 0.7058255076408386 }, { "epoch": 0.0751433656317975, "step": 760, "train/sim_loss": 0.12890625 }, { "epoch": 0.0751433656317975, "step": 760, "train/total_loss": 0.1994888037443161 }, { "entropy": 9.319364547729492, "epoch": 0.07524223848131303, "mean_token_accuracy": 0.7554038763046265, "num_tokens": 4192695.0, "step": 761, "train/ce_loss": 0.8958869576454163 }, { "epoch": 0.07524223848131303, "step": 761, "train/sim_loss": 0.0390625 }, { "epoch": 0.07524223848131303, "step": 761, "train/total_loss": 0.1286512017250061 }, { "entropy": 9.056702613830566, "epoch": 0.07534111133082856, "mean_token_accuracy": 0.6752491593360901, "num_tokens": 4198623.0, "step": 762, "train/ce_loss": 0.6274706721305847 }, { "epoch": 0.07534111133082856, "step": 762, "train/sim_loss": 0.140625 }, { "epoch": 0.07534111133082856, "step": 762, "train/total_loss": 0.203372061252594 }, { "entropy": 9.05320930480957, "epoch": 0.07543998418034407, "mean_token_accuracy": 0.7255859375, "num_tokens": 4204311.0, "step": 763, "train/ce_loss": 0.7925764322280884 }, { "epoch": 0.07543998418034407, "step": 763, "train/sim_loss": 0.0234375 }, { "epoch": 0.07543998418034407, "step": 763, "train/total_loss": 0.10269514471292496 }, { "entropy": 9.27361011505127, "epoch": 0.0755388570298596, "mean_token_accuracy": 0.7014492750167847, "num_tokens": 4209943.0, "step": 764, "train/ce_loss": 0.9008075594902039 }, { "epoch": 0.0755388570298596, "step": 764, "train/sim_loss": 0.125 }, { "epoch": 0.0755388570298596, "step": 764, "train/total_loss": 0.21508076786994934 }, { "entropy": 9.392921447753906, "epoch": 0.07563772987937513, "mean_token_accuracy": 0.7166853547096252, "num_tokens": 4215278.0, "step": 765, "train/ce_loss": 0.9357327818870544 }, { "epoch": 0.07563772987937513, "step": 765, "train/sim_loss": 0.078125 }, { "epoch": 0.07563772987937513, "step": 765, "train/total_loss": 0.17169827222824097 }, { "entropy": 9.376035690307617, "epoch": 0.07573660272889064, "mean_token_accuracy": 0.735381543636322, "num_tokens": 4220748.0, "step": 766, "train/ce_loss": 0.7165777683258057 }, { "epoch": 0.07573660272889064, "step": 766, "train/sim_loss": 0.1171875 }, { "epoch": 0.07573660272889064, "step": 766, "train/total_loss": 0.18884527683258057 }, { "entropy": 9.752302169799805, "epoch": 0.07583547557840617, "mean_token_accuracy": 0.7475177049636841, "num_tokens": 4225962.0, "step": 767, "train/ce_loss": 0.5940572023391724 }, { "epoch": 0.07583547557840617, "step": 767, "train/sim_loss": 0.06640625 }, { "epoch": 0.07583547557840617, "step": 767, "train/total_loss": 0.12581196427345276 }, { "entropy": 9.297534942626953, "epoch": 0.0759343484279217, "mean_token_accuracy": 0.7605177760124207, "num_tokens": 4231495.0, "step": 768, "train/ce_loss": 0.6398865580558777 }, { "epoch": 0.0759343484279217, "step": 768, "train/sim_loss": 0.03515625 }, { "epoch": 0.0759343484279217, "step": 768, "train/total_loss": 0.09914490580558777 }, { "entropy": 9.505453109741211, "epoch": 0.07603322127743721, "mean_token_accuracy": 0.7800982594490051, "num_tokens": 4236899.0, "step": 769, "train/ce_loss": 0.5162529945373535 }, { "epoch": 0.07603322127743721, "step": 769, "train/sim_loss": 0.078125 }, { "epoch": 0.07603322127743721, "step": 769, "train/total_loss": 0.1297502964735031 }, { "entropy": 9.465841293334961, "epoch": 0.07613209412695274, "mean_token_accuracy": 0.7255370020866394, "num_tokens": 4242278.0, "step": 770, "train/ce_loss": 0.5750340223312378 }, { "epoch": 0.07613209412695274, "step": 770, "train/sim_loss": 0.0859375 }, { "epoch": 0.07613209412695274, "step": 770, "train/total_loss": 0.14344090223312378 }, { "entropy": 9.554760932922363, "epoch": 0.07623096697646826, "mean_token_accuracy": 0.7279503345489502, "num_tokens": 4247645.0, "step": 771, "train/ce_loss": 0.9062254428863525 }, { "epoch": 0.07623096697646826, "step": 771, "train/sim_loss": 0.0859375 }, { "epoch": 0.07623096697646826, "step": 771, "train/total_loss": 0.17656004428863525 }, { "entropy": 9.705167770385742, "epoch": 0.07632983982598378, "mean_token_accuracy": 0.7647058963775635, "num_tokens": 4252866.0, "step": 772, "train/ce_loss": 0.45254525542259216 }, { "epoch": 0.07632983982598378, "step": 772, "train/sim_loss": 0.0859375 }, { "epoch": 0.07632983982598378, "step": 772, "train/total_loss": 0.13119202852249146 }, { "entropy": 9.496273040771484, "epoch": 0.0764287126754993, "mean_token_accuracy": 0.7571059465408325, "num_tokens": 4258257.0, "step": 773, "train/ce_loss": 0.7584699988365173 }, { "epoch": 0.0764287126754993, "step": 773, "train/sim_loss": 0.08984375 }, { "epoch": 0.0764287126754993, "step": 773, "train/total_loss": 0.16569074988365173 }, { "entropy": 9.445784568786621, "epoch": 0.07652758552501483, "mean_token_accuracy": 0.7473118305206299, "num_tokens": 4263719.0, "step": 774, "train/ce_loss": 0.6379207372665405 }, { "epoch": 0.07652758552501483, "step": 774, "train/sim_loss": 0.046875 }, { "epoch": 0.07652758552501483, "step": 774, "train/total_loss": 0.11066707223653793 }, { "entropy": 9.32603645324707, "epoch": 0.07662645837453036, "mean_token_accuracy": 0.7318681478500366, "num_tokens": 4269296.0, "step": 775, "train/ce_loss": 0.4980626404285431 }, { "epoch": 0.07662645837453036, "step": 775, "train/sim_loss": 0.07421875 }, { "epoch": 0.07662645837453036, "step": 775, "train/total_loss": 0.12402501702308655 }, { "entropy": 9.222501754760742, "epoch": 0.07672533122404587, "mean_token_accuracy": 0.7440000176429749, "num_tokens": 4274742.0, "step": 776, "train/ce_loss": 0.8437198400497437 }, { "epoch": 0.07672533122404587, "step": 776, "train/sim_loss": 0.1015625 }, { "epoch": 0.07672533122404587, "step": 776, "train/total_loss": 0.18593448400497437 }, { "entropy": 9.258399963378906, "epoch": 0.0768242040735614, "mean_token_accuracy": 0.7156448364257812, "num_tokens": 4280339.0, "step": 777, "train/ce_loss": 1.0738810300827026 }, { "epoch": 0.0768242040735614, "step": 777, "train/sim_loss": 0.12109375 }, { "epoch": 0.0768242040735614, "step": 777, "train/total_loss": 0.22848185896873474 }, { "entropy": 8.862468719482422, "epoch": 0.07692307692307693, "mean_token_accuracy": 0.7092084288597107, "num_tokens": 4286202.0, "step": 778, "train/ce_loss": 0.7340808510780334 }, { "epoch": 0.07692307692307693, "step": 778, "train/sim_loss": 0.09375 }, { "epoch": 0.07692307692307693, "step": 778, "train/total_loss": 0.1671580970287323 }, { "entropy": 9.585132598876953, "epoch": 0.07702194977259244, "mean_token_accuracy": 0.777208685874939, "num_tokens": 4291584.0, "step": 779, "train/ce_loss": 0.9645556807518005 }, { "epoch": 0.07702194977259244, "step": 779, "train/sim_loss": 0.10546875 }, { "epoch": 0.07702194977259244, "step": 779, "train/total_loss": 0.20192432403564453 }, { "epoch": 0.07712082262210797, "grad_norm": 1.0723875761032104, "learning_rate": 9.809869950056868e-06, "loss": 0.1775, "step": 780 }, { "entropy": 9.43270492553711, "epoch": 0.07712082262210797, "mean_token_accuracy": 0.7490494251251221, "num_tokens": 4297045.0, "step": 780, "train/ce_loss": 0.9094752073287964 }, { "epoch": 0.07712082262210797, "step": 780, "train/sim_loss": 0.09765625 }, { "epoch": 0.07712082262210797, "step": 780, "train/total_loss": 0.18860377371311188 }, { "entropy": 9.522924423217773, "epoch": 0.0772196954716235, "mean_token_accuracy": 0.7254641652107239, "num_tokens": 4302440.0, "step": 781, "train/ce_loss": 1.0830698013305664 }, { "epoch": 0.0772196954716235, "step": 781, "train/sim_loss": 0.15625 }, { "epoch": 0.0772196954716235, "step": 781, "train/total_loss": 0.26455697417259216 }, { "entropy": 9.411714553833008, "epoch": 0.07731856832113901, "mean_token_accuracy": 0.7276940941810608, "num_tokens": 4307979.0, "step": 782, "train/ce_loss": 0.9000190496444702 }, { "epoch": 0.07731856832113901, "step": 782, "train/sim_loss": 0.08203125 }, { "epoch": 0.07731856832113901, "step": 782, "train/total_loss": 0.1720331609249115 }, { "entropy": 9.434417724609375, "epoch": 0.07741744117065454, "mean_token_accuracy": 0.7407407164573669, "num_tokens": 4313719.0, "step": 783, "train/ce_loss": 0.6984912753105164 }, { "epoch": 0.07741744117065454, "step": 783, "train/sim_loss": 0.1015625 }, { "epoch": 0.07741744117065454, "step": 783, "train/total_loss": 0.1714116334915161 }, { "entropy": 9.45406436920166, "epoch": 0.07751631402017006, "mean_token_accuracy": 0.7254658341407776, "num_tokens": 4319195.0, "step": 784, "train/ce_loss": 0.5465412139892578 }, { "epoch": 0.07751631402017006, "step": 784, "train/sim_loss": 0.1328125 }, { "epoch": 0.07751631402017006, "step": 784, "train/total_loss": 0.18746662139892578 }, { "entropy": 8.940170288085938, "epoch": 0.07761518686968559, "mean_token_accuracy": 0.739357054233551, "num_tokens": 4324861.0, "step": 785, "train/ce_loss": 0.36967816948890686 }, { "epoch": 0.07761518686968559, "step": 785, "train/sim_loss": 0.04296875 }, { "epoch": 0.07761518686968559, "step": 785, "train/total_loss": 0.07993656396865845 }, { "entropy": 9.08770751953125, "epoch": 0.0777140597192011, "mean_token_accuracy": 0.7765747904777527, "num_tokens": 4330574.0, "step": 786, "train/ce_loss": 0.6479521989822388 }, { "epoch": 0.0777140597192011, "step": 786, "train/sim_loss": 0.17578125 }, { "epoch": 0.0777140597192011, "step": 786, "train/total_loss": 0.24057647585868835 }, { "entropy": 9.308185577392578, "epoch": 0.07781293256871663, "mean_token_accuracy": 0.7486218214035034, "num_tokens": 4336162.0, "step": 787, "train/ce_loss": 0.9546238780021667 }, { "epoch": 0.07781293256871663, "step": 787, "train/sim_loss": 0.109375 }, { "epoch": 0.07781293256871663, "step": 787, "train/total_loss": 0.2048373818397522 }, { "entropy": 9.178385734558105, "epoch": 0.07791180541823216, "mean_token_accuracy": 0.7177974581718445, "num_tokens": 4341809.0, "step": 788, "train/ce_loss": 1.2795974016189575 }, { "epoch": 0.07791180541823216, "step": 788, "train/sim_loss": 0.0859375 }, { "epoch": 0.07791180541823216, "step": 788, "train/total_loss": 0.213897243142128 }, { "entropy": 9.612168312072754, "epoch": 0.07801067826774767, "mean_token_accuracy": 0.7340153455734253, "num_tokens": 4347132.0, "step": 789, "train/ce_loss": 0.9715099334716797 }, { "epoch": 0.07801067826774767, "step": 789, "train/sim_loss": 0.1015625 }, { "epoch": 0.07801067826774767, "step": 789, "train/total_loss": 0.1987134963274002 }, { "entropy": 9.382074356079102, "epoch": 0.0781095511172632, "mean_token_accuracy": 0.7291898131370544, "num_tokens": 4352678.0, "step": 790, "train/ce_loss": 1.279839038848877 }, { "epoch": 0.0781095511172632, "step": 790, "train/sim_loss": 0.1015625 }, { "epoch": 0.0781095511172632, "step": 790, "train/total_loss": 0.2295464128255844 }, { "entropy": 9.496596336364746, "epoch": 0.07820842396677873, "mean_token_accuracy": 0.6992143392562866, "num_tokens": 4358129.0, "step": 791, "train/ce_loss": 0.9078870415687561 }, { "epoch": 0.07820842396677873, "step": 791, "train/sim_loss": 0.08984375 }, { "epoch": 0.07820842396677873, "step": 791, "train/total_loss": 0.18063245713710785 }, { "entropy": 9.78567123413086, "epoch": 0.07830729681629424, "mean_token_accuracy": 0.7446504831314087, "num_tokens": 4363395.0, "step": 792, "train/ce_loss": 0.9810066819190979 }, { "epoch": 0.07830729681629424, "step": 792, "train/sim_loss": 0.078125 }, { "epoch": 0.07830729681629424, "step": 792, "train/total_loss": 0.1762256622314453 }, { "entropy": 9.67739486694336, "epoch": 0.07840616966580977, "mean_token_accuracy": 0.7483355402946472, "num_tokens": 4368704.0, "step": 793, "train/ce_loss": 0.7123100757598877 }, { "epoch": 0.07840616966580977, "step": 793, "train/sim_loss": 0.09765625 }, { "epoch": 0.07840616966580977, "step": 793, "train/total_loss": 0.16888725757598877 }, { "entropy": 9.6588134765625, "epoch": 0.0785050425153253, "mean_token_accuracy": 0.7130801677703857, "num_tokens": 4374061.0, "step": 794, "train/ce_loss": 0.7116860747337341 }, { "epoch": 0.0785050425153253, "step": 794, "train/sim_loss": 0.08984375 }, { "epoch": 0.0785050425153253, "step": 794, "train/total_loss": 0.16101235151290894 }, { "entropy": 9.478083610534668, "epoch": 0.07860391536484082, "mean_token_accuracy": 0.7200461030006409, "num_tokens": 4379539.0, "step": 795, "train/ce_loss": 0.4054833948612213 }, { "epoch": 0.07860391536484082, "step": 795, "train/sim_loss": 0.09375 }, { "epoch": 0.07860391536484082, "step": 795, "train/total_loss": 0.13429833948612213 }, { "entropy": 9.274872779846191, "epoch": 0.07870278821435633, "mean_token_accuracy": 0.7296416759490967, "num_tokens": 4385099.0, "step": 796, "train/ce_loss": 1.06484055519104 }, { "epoch": 0.07870278821435633, "step": 796, "train/sim_loss": 0.12109375 }, { "epoch": 0.07870278821435633, "step": 796, "train/total_loss": 0.227577805519104 }, { "entropy": 9.521102905273438, "epoch": 0.07880166106387186, "mean_token_accuracy": 0.7887668013572693, "num_tokens": 4390433.0, "step": 797, "train/ce_loss": 0.9165420532226562 }, { "epoch": 0.07880166106387186, "step": 797, "train/sim_loss": 0.0390625 }, { "epoch": 0.07880166106387186, "step": 797, "train/total_loss": 0.1307167112827301 }, { "entropy": 9.42955207824707, "epoch": 0.07890053391338739, "mean_token_accuracy": 0.7367773652076721, "num_tokens": 4395861.0, "step": 798, "train/ce_loss": 0.8085328936576843 }, { "epoch": 0.07890053391338739, "step": 798, "train/sim_loss": 0.1171875 }, { "epoch": 0.07890053391338739, "step": 798, "train/total_loss": 0.19804078340530396 }, { "entropy": 9.82484245300293, "epoch": 0.0789994067629029, "mean_token_accuracy": 0.7967115044593811, "num_tokens": 4401161.0, "step": 799, "train/ce_loss": 0.8346916437149048 }, { "epoch": 0.0789994067629029, "step": 799, "train/sim_loss": 0.0625 }, { "epoch": 0.0789994067629029, "step": 799, "train/total_loss": 0.14596916735172272 }, { "epoch": 0.07909827961241843, "grad_norm": 1.1731499433517456, "learning_rate": 9.804925085298918e-06, "loss": 0.1846, "step": 800 }, { "entropy": 9.274038314819336, "epoch": 0.07909827961241843, "mean_token_accuracy": 0.7175824046134949, "num_tokens": 4406709.0, "step": 800, "train/ce_loss": 1.4107813835144043 }, { "epoch": 0.07909827961241843, "step": 800, "train/sim_loss": 0.1328125 }, { "epoch": 0.07909827961241843, "step": 800, "train/total_loss": 0.2738906443119049 }, { "entropy": 9.344045639038086, "epoch": 0.07919715246193396, "mean_token_accuracy": 0.6980306506156921, "num_tokens": 4412221.0, "step": 801, "train/ce_loss": 0.6394026875495911 }, { "epoch": 0.07919715246193396, "step": 801, "train/sim_loss": 0.15625 }, { "epoch": 0.07919715246193396, "step": 801, "train/total_loss": 0.22019027173519135 }, { "entropy": 9.615592002868652, "epoch": 0.07929602531144947, "mean_token_accuracy": 0.7380645275115967, "num_tokens": 4417556.0, "step": 802, "train/ce_loss": 0.9174885153770447 }, { "epoch": 0.07929602531144947, "step": 802, "train/sim_loss": 0.0703125 }, { "epoch": 0.07929602531144947, "step": 802, "train/total_loss": 0.16206136345863342 }, { "entropy": 9.446781158447266, "epoch": 0.079394898160965, "mean_token_accuracy": 0.7295597195625305, "num_tokens": 4422981.0, "step": 803, "train/ce_loss": 0.8224945068359375 }, { "epoch": 0.079394898160965, "step": 803, "train/sim_loss": 0.1171875 }, { "epoch": 0.079394898160965, "step": 803, "train/total_loss": 0.1994369626045227 }, { "entropy": 9.643782615661621, "epoch": 0.07949377101048052, "mean_token_accuracy": 0.7276478409767151, "num_tokens": 4428291.0, "step": 804, "train/ce_loss": 0.7969783544540405 }, { "epoch": 0.07949377101048052, "step": 804, "train/sim_loss": 0.08984375 }, { "epoch": 0.07949377101048052, "step": 804, "train/total_loss": 0.169541597366333 }, { "entropy": 9.057640075683594, "epoch": 0.07959264385999605, "mean_token_accuracy": 0.6988171339035034, "num_tokens": 4433883.0, "step": 805, "train/ce_loss": 0.7382174730300903 }, { "epoch": 0.07959264385999605, "step": 805, "train/sim_loss": 0.10546875 }, { "epoch": 0.07959264385999605, "step": 805, "train/total_loss": 0.1792905032634735 }, { "entropy": 9.37064266204834, "epoch": 0.07969151670951156, "mean_token_accuracy": 0.6890547275543213, "num_tokens": 4439262.0, "step": 806, "train/ce_loss": 1.1229499578475952 }, { "epoch": 0.07969151670951156, "step": 806, "train/sim_loss": 0.1171875 }, { "epoch": 0.07969151670951156, "step": 806, "train/total_loss": 0.229482501745224 }, { "entropy": 9.1074800491333, "epoch": 0.07979038955902709, "mean_token_accuracy": 0.7259658575057983, "num_tokens": 4444887.0, "step": 807, "train/ce_loss": 0.6643164157867432 }, { "epoch": 0.07979038955902709, "step": 807, "train/sim_loss": 0.12890625 }, { "epoch": 0.07979038955902709, "step": 807, "train/total_loss": 0.19533789157867432 }, { "entropy": 9.457550048828125, "epoch": 0.07988926240854262, "mean_token_accuracy": 0.7506459951400757, "num_tokens": 4450254.0, "step": 808, "train/ce_loss": 0.8655195236206055 }, { "epoch": 0.07988926240854262, "step": 808, "train/sim_loss": 0.109375 }, { "epoch": 0.07988926240854262, "step": 808, "train/total_loss": 0.1959269642829895 }, { "entropy": 9.606245040893555, "epoch": 0.07998813525805813, "mean_token_accuracy": 0.7023959755897522, "num_tokens": 4455601.0, "step": 809, "train/ce_loss": 1.0111688375473022 }, { "epoch": 0.07998813525805813, "step": 809, "train/sim_loss": 0.1015625 }, { "epoch": 0.07998813525805813, "step": 809, "train/total_loss": 0.20267939567565918 }, { "entropy": 9.417563438415527, "epoch": 0.08008700810757366, "mean_token_accuracy": 0.7117318511009216, "num_tokens": 4461150.0, "step": 810, "train/ce_loss": 1.6295825242996216 }, { "epoch": 0.08008700810757366, "step": 810, "train/sim_loss": 0.125 }, { "epoch": 0.08008700810757366, "step": 810, "train/total_loss": 0.2879582643508911 }, { "entropy": 8.938827514648438, "epoch": 0.08018588095708919, "mean_token_accuracy": 0.6973270177841187, "num_tokens": 4466970.0, "step": 811, "train/ce_loss": 0.9889235496520996 }, { "epoch": 0.08018588095708919, "step": 811, "train/sim_loss": 0.11328125 }, { "epoch": 0.08018588095708919, "step": 811, "train/total_loss": 0.21217361092567444 }, { "entropy": 9.687152862548828, "epoch": 0.0802847538066047, "mean_token_accuracy": 0.6964539289474487, "num_tokens": 4472258.0, "step": 812, "train/ce_loss": 0.9444341063499451 }, { "epoch": 0.0802847538066047, "step": 812, "train/sim_loss": 0.0703125 }, { "epoch": 0.0802847538066047, "step": 812, "train/total_loss": 0.1647559106349945 }, { "entropy": 9.434215545654297, "epoch": 0.08038362665612023, "mean_token_accuracy": 0.7371007204055786, "num_tokens": 4477667.0, "step": 813, "train/ce_loss": 0.4823085069656372 }, { "epoch": 0.08038362665612023, "step": 813, "train/sim_loss": 0.11328125 }, { "epoch": 0.08038362665612023, "step": 813, "train/total_loss": 0.1615121066570282 }, { "entropy": 9.438498497009277, "epoch": 0.08048249950563575, "mean_token_accuracy": 0.7050847411155701, "num_tokens": 4483207.0, "step": 814, "train/ce_loss": 1.636538028717041 }, { "epoch": 0.08048249950563575, "step": 814, "train/sim_loss": 0.1171875 }, { "epoch": 0.08048249950563575, "step": 814, "train/total_loss": 0.28084129095077515 }, { "entropy": 9.504042625427246, "epoch": 0.08058137235515128, "mean_token_accuracy": 0.7338501214981079, "num_tokens": 4488644.0, "step": 815, "train/ce_loss": 0.9487102031707764 }, { "epoch": 0.08058137235515128, "step": 815, "train/sim_loss": 0.05078125 }, { "epoch": 0.08058137235515128, "step": 815, "train/total_loss": 0.14565226435661316 }, { "entropy": 9.328442573547363, "epoch": 0.0806802452046668, "mean_token_accuracy": 0.7654584050178528, "num_tokens": 4494184.0, "step": 816, "train/ce_loss": 0.4445994794368744 }, { "epoch": 0.0806802452046668, "step": 816, "train/sim_loss": 0.0546875 }, { "epoch": 0.0806802452046668, "step": 816, "train/total_loss": 0.09914745390415192 }, { "entropy": 9.465474128723145, "epoch": 0.08077911805418232, "mean_token_accuracy": 0.7263279557228088, "num_tokens": 4499626.0, "step": 817, "train/ce_loss": 0.47822391986846924 }, { "epoch": 0.08077911805418232, "step": 817, "train/sim_loss": 0.03515625 }, { "epoch": 0.08077911805418232, "step": 817, "train/total_loss": 0.08297864347696304 }, { "entropy": 9.391244888305664, "epoch": 0.08087799090369785, "mean_token_accuracy": 0.7609987854957581, "num_tokens": 4505089.0, "step": 818, "train/ce_loss": 0.8075358867645264 }, { "epoch": 0.08087799090369785, "step": 818, "train/sim_loss": 0.0390625 }, { "epoch": 0.08087799090369785, "step": 818, "train/total_loss": 0.11981608718633652 }, { "entropy": 9.63552188873291, "epoch": 0.08097686375321336, "mean_token_accuracy": 0.6845549941062927, "num_tokens": 4510453.0, "step": 819, "train/ce_loss": 1.093739628791809 }, { "epoch": 0.08097686375321336, "step": 819, "train/sim_loss": 0.08984375 }, { "epoch": 0.08097686375321336, "step": 819, "train/total_loss": 0.19921770691871643 }, { "epoch": 0.08107573660272889, "grad_norm": 1.2534371614456177, "learning_rate": 9.799980220540969e-06, "loss": 0.1866, "step": 820 }, { "entropy": 9.010088920593262, "epoch": 0.08107573660272889, "mean_token_accuracy": 0.7017874121665955, "num_tokens": 4516114.0, "step": 820, "train/ce_loss": 0.5793368220329285 }, { "epoch": 0.08107573660272889, "step": 820, "train/sim_loss": 0.1328125 }, { "epoch": 0.08107573660272889, "step": 820, "train/total_loss": 0.19074618816375732 }, { "entropy": 9.72114086151123, "epoch": 0.08117460945224442, "mean_token_accuracy": 0.710806667804718, "num_tokens": 4521318.0, "step": 821, "train/ce_loss": 0.9095756411552429 }, { "epoch": 0.08117460945224442, "step": 821, "train/sim_loss": 0.1015625 }, { "epoch": 0.08117460945224442, "step": 821, "train/total_loss": 0.19252006709575653 }, { "entropy": 9.003988265991211, "epoch": 0.08127348230175993, "mean_token_accuracy": 0.7351460456848145, "num_tokens": 4526956.0, "step": 822, "train/ce_loss": 0.9122998118400574 }, { "epoch": 0.08127348230175993, "step": 822, "train/sim_loss": 0.12890625 }, { "epoch": 0.08127348230175993, "step": 822, "train/total_loss": 0.22013622522354126 }, { "entropy": 9.285892486572266, "epoch": 0.08137235515127546, "mean_token_accuracy": 0.7508611083030701, "num_tokens": 4532421.0, "step": 823, "train/ce_loss": 0.770931601524353 }, { "epoch": 0.08137235515127546, "step": 823, "train/sim_loss": 0.03515625 }, { "epoch": 0.08137235515127546, "step": 823, "train/total_loss": 0.11224941164255142 }, { "entropy": 9.284485816955566, "epoch": 0.08147122800079099, "mean_token_accuracy": 0.794731080532074, "num_tokens": 4537979.0, "step": 824, "train/ce_loss": 0.7350627779960632 }, { "epoch": 0.08147122800079099, "step": 824, "train/sim_loss": 0.09375 }, { "epoch": 0.08147122800079099, "step": 824, "train/total_loss": 0.16725628077983856 }, { "entropy": 9.299266815185547, "epoch": 0.08157010085030651, "mean_token_accuracy": 0.75, "num_tokens": 4543520.0, "step": 825, "train/ce_loss": 1.089153528213501 }, { "epoch": 0.08157010085030651, "step": 825, "train/sim_loss": 0.1796875 }, { "epoch": 0.08157010085030651, "step": 825, "train/total_loss": 0.2886028587818146 }, { "entropy": 9.742826461791992, "epoch": 0.08166897369982203, "mean_token_accuracy": 0.7399103045463562, "num_tokens": 4548734.0, "step": 826, "train/ce_loss": 1.0690205097198486 }, { "epoch": 0.08166897369982203, "step": 826, "train/sim_loss": 0.10546875 }, { "epoch": 0.08166897369982203, "step": 826, "train/total_loss": 0.21237081289291382 }, { "entropy": 9.397270202636719, "epoch": 0.08176784654933755, "mean_token_accuracy": 0.746198832988739, "num_tokens": 4554223.0, "step": 827, "train/ce_loss": 0.5500451326370239 }, { "epoch": 0.08176784654933755, "step": 827, "train/sim_loss": 0.1640625 }, { "epoch": 0.08176784654933755, "step": 827, "train/total_loss": 0.21906700730323792 }, { "entropy": 9.360734939575195, "epoch": 0.08186671939885308, "mean_token_accuracy": 0.7011111378669739, "num_tokens": 4559721.0, "step": 828, "train/ce_loss": 0.7135817408561707 }, { "epoch": 0.08186671939885308, "step": 828, "train/sim_loss": 0.0390625 }, { "epoch": 0.08186671939885308, "step": 828, "train/total_loss": 0.11042067408561707 }, { "entropy": 9.139347076416016, "epoch": 0.0819655922483686, "mean_token_accuracy": 0.7419999837875366, "num_tokens": 4565343.0, "step": 829, "train/ce_loss": 1.1887017488479614 }, { "epoch": 0.0819655922483686, "step": 829, "train/sim_loss": 0.1328125 }, { "epoch": 0.0819655922483686, "step": 829, "train/total_loss": 0.25168266892433167 }, { "entropy": 9.348533630371094, "epoch": 0.08206446509788412, "mean_token_accuracy": 0.7391892075538635, "num_tokens": 4570738.0, "step": 830, "train/ce_loss": 0.7430282831192017 }, { "epoch": 0.08206446509788412, "step": 830, "train/sim_loss": 0.08984375 }, { "epoch": 0.08206446509788412, "step": 830, "train/total_loss": 0.1641465723514557 }, { "entropy": 9.583702087402344, "epoch": 0.08216333794739965, "mean_token_accuracy": 0.7526041865348816, "num_tokens": 4576153.0, "step": 831, "train/ce_loss": 0.3657197654247284 }, { "epoch": 0.08216333794739965, "step": 831, "train/sim_loss": 0.0703125 }, { "epoch": 0.08216333794739965, "step": 831, "train/total_loss": 0.10688447952270508 }, { "entropy": 9.380016326904297, "epoch": 0.08226221079691516, "mean_token_accuracy": 0.7756410241127014, "num_tokens": 4581594.0, "step": 832, "train/ce_loss": 0.767584502696991 }, { "epoch": 0.08226221079691516, "step": 832, "train/sim_loss": 0.03125 }, { "epoch": 0.08226221079691516, "step": 832, "train/total_loss": 0.10800845175981522 }, { "entropy": 9.28530216217041, "epoch": 0.08236108364643069, "mean_token_accuracy": 0.7465388774871826, "num_tokens": 4587150.0, "step": 833, "train/ce_loss": 0.938029408454895 }, { "epoch": 0.08236108364643069, "step": 833, "train/sim_loss": 0.1015625 }, { "epoch": 0.08236108364643069, "step": 833, "train/total_loss": 0.19536544382572174 }, { "entropy": 9.232719421386719, "epoch": 0.08245995649594622, "mean_token_accuracy": 0.7084168195724487, "num_tokens": 4592832.0, "step": 834, "train/ce_loss": 0.3290528357028961 }, { "epoch": 0.08245995649594622, "step": 834, "train/sim_loss": 0.09765625 }, { "epoch": 0.08245995649594622, "step": 834, "train/total_loss": 0.13056153059005737 }, { "entropy": 9.22658920288086, "epoch": 0.08255882934546174, "mean_token_accuracy": 0.696739137172699, "num_tokens": 4598387.0, "step": 835, "train/ce_loss": 1.1875358819961548 }, { "epoch": 0.08255882934546174, "step": 835, "train/sim_loss": 0.109375 }, { "epoch": 0.08255882934546174, "step": 835, "train/total_loss": 0.228128582239151 }, { "entropy": 9.17701530456543, "epoch": 0.08265770219497726, "mean_token_accuracy": 0.7338709831237793, "num_tokens": 4603812.0, "step": 836, "train/ce_loss": 1.207872748374939 }, { "epoch": 0.08265770219497726, "step": 836, "train/sim_loss": 0.15234375 }, { "epoch": 0.08265770219497726, "step": 836, "train/total_loss": 0.27313101291656494 }, { "entropy": 9.360685348510742, "epoch": 0.08275657504449278, "mean_token_accuracy": 0.7243436574935913, "num_tokens": 4609294.0, "step": 837, "train/ce_loss": 0.9400960803031921 }, { "epoch": 0.08275657504449278, "step": 837, "train/sim_loss": 0.1640625 }, { "epoch": 0.08275657504449278, "step": 837, "train/total_loss": 0.2580721080303192 }, { "entropy": 9.39744758605957, "epoch": 0.08285544789400831, "mean_token_accuracy": 0.75, "num_tokens": 4614838.0, "step": 838, "train/ce_loss": 0.5984823703765869 }, { "epoch": 0.08285544789400831, "step": 838, "train/sim_loss": 0.09765625 }, { "epoch": 0.08285544789400831, "step": 838, "train/total_loss": 0.15750448405742645 }, { "entropy": 9.604251861572266, "epoch": 0.08295432074352382, "mean_token_accuracy": 0.7657296061515808, "num_tokens": 4620170.0, "step": 839, "train/ce_loss": 0.9629525542259216 }, { "epoch": 0.08295432074352382, "step": 839, "train/sim_loss": 0.08984375 }, { "epoch": 0.08295432074352382, "step": 839, "train/total_loss": 0.18613901734352112 }, { "epoch": 0.08305319359303935, "grad_norm": 0.9733732342720032, "learning_rate": 9.79503535578302e-06, "loss": 0.1821, "step": 840 }, { "entropy": 9.343636512756348, "epoch": 0.08305319359303935, "mean_token_accuracy": 0.7245370149612427, "num_tokens": 4625667.0, "step": 840, "train/ce_loss": 0.6069525480270386 }, { "epoch": 0.08305319359303935, "step": 840, "train/sim_loss": 0.0703125 }, { "epoch": 0.08305319359303935, "step": 840, "train/total_loss": 0.13100776076316833 }, { "entropy": 9.397802352905273, "epoch": 0.08315206644255488, "mean_token_accuracy": 0.7330960631370544, "num_tokens": 4631141.0, "step": 841, "train/ce_loss": 1.3006961345672607 }, { "epoch": 0.08315206644255488, "step": 841, "train/sim_loss": 0.1328125 }, { "epoch": 0.08315206644255488, "step": 841, "train/total_loss": 0.2628821134567261 }, { "entropy": 9.608840942382812, "epoch": 0.08325093929207039, "mean_token_accuracy": 0.7257142663002014, "num_tokens": 4636496.0, "step": 842, "train/ce_loss": 0.7145105004310608 }, { "epoch": 0.08325093929207039, "step": 842, "train/sim_loss": 0.1015625 }, { "epoch": 0.08325093929207039, "step": 842, "train/total_loss": 0.17301355302333832 }, { "entropy": 9.225297927856445, "epoch": 0.08334981214158592, "mean_token_accuracy": 0.7326943278312683, "num_tokens": 4642085.0, "step": 843, "train/ce_loss": 0.8060480952262878 }, { "epoch": 0.08334981214158592, "step": 843, "train/sim_loss": 0.1015625 }, { "epoch": 0.08334981214158592, "step": 843, "train/total_loss": 0.18216732144355774 }, { "entropy": 9.535888671875, "epoch": 0.08344868499110145, "mean_token_accuracy": 0.6859903335571289, "num_tokens": 4647497.0, "step": 844, "train/ce_loss": 0.7223190665245056 }, { "epoch": 0.08344868499110145, "step": 844, "train/sim_loss": 0.1015625 }, { "epoch": 0.08344868499110145, "step": 844, "train/total_loss": 0.17379441857337952 }, { "entropy": 9.20891284942627, "epoch": 0.08354755784061697, "mean_token_accuracy": 0.7077577114105225, "num_tokens": 4653096.0, "step": 845, "train/ce_loss": 0.9363729357719421 }, { "epoch": 0.08354755784061697, "step": 845, "train/sim_loss": 0.1328125 }, { "epoch": 0.08354755784061697, "step": 845, "train/total_loss": 0.22644978761672974 }, { "entropy": 9.275154113769531, "epoch": 0.08364643069013249, "mean_token_accuracy": 0.7118847370147705, "num_tokens": 4658578.0, "step": 846, "train/ce_loss": 0.8228893876075745 }, { "epoch": 0.08364643069013249, "step": 846, "train/sim_loss": 0.08984375 }, { "epoch": 0.08364643069013249, "step": 846, "train/total_loss": 0.1721327006816864 }, { "entropy": 9.315937042236328, "epoch": 0.08374530353964801, "mean_token_accuracy": 0.7202312350273132, "num_tokens": 4664009.0, "step": 847, "train/ce_loss": 0.7127370834350586 }, { "epoch": 0.08374530353964801, "step": 847, "train/sim_loss": 0.0703125 }, { "epoch": 0.08374530353964801, "step": 847, "train/total_loss": 0.14158621430397034 }, { "entropy": 9.096430778503418, "epoch": 0.08384417638916354, "mean_token_accuracy": 0.7056768536567688, "num_tokens": 4669760.0, "step": 848, "train/ce_loss": 0.7465432286262512 }, { "epoch": 0.08384417638916354, "step": 848, "train/sim_loss": 0.14453125 }, { "epoch": 0.08384417638916354, "step": 848, "train/total_loss": 0.21918557584285736 }, { "entropy": 9.020623207092285, "epoch": 0.08394304923867905, "mean_token_accuracy": 0.7511394619941711, "num_tokens": 4675403.0, "step": 849, "train/ce_loss": 0.6737596392631531 }, { "epoch": 0.08394304923867905, "step": 849, "train/sim_loss": 0.09765625 }, { "epoch": 0.08394304923867905, "step": 849, "train/total_loss": 0.16503220796585083 }, { "entropy": 9.301700592041016, "epoch": 0.08404192208819458, "mean_token_accuracy": 0.7677018642425537, "num_tokens": 4680857.0, "step": 850, "train/ce_loss": 0.6431403160095215 }, { "epoch": 0.08404192208819458, "step": 850, "train/sim_loss": 0.0390625 }, { "epoch": 0.08404192208819458, "step": 850, "train/total_loss": 0.10337653011083603 }, { "entropy": 9.437649726867676, "epoch": 0.08414079493771011, "mean_token_accuracy": 0.727148711681366, "num_tokens": 4686141.0, "step": 851, "train/ce_loss": 0.9740714430809021 }, { "epoch": 0.08414079493771011, "step": 851, "train/sim_loss": 0.0546875 }, { "epoch": 0.08414079493771011, "step": 851, "train/total_loss": 0.15209464728832245 }, { "entropy": 9.298521041870117, "epoch": 0.08423966778722562, "mean_token_accuracy": 0.7249224185943604, "num_tokens": 4691595.0, "step": 852, "train/ce_loss": 0.931045651435852 }, { "epoch": 0.08423966778722562, "step": 852, "train/sim_loss": 0.06640625 }, { "epoch": 0.08423966778722562, "step": 852, "train/total_loss": 0.15951082110404968 }, { "entropy": 9.663223266601562, "epoch": 0.08433854063674115, "mean_token_accuracy": 0.7373737096786499, "num_tokens": 4696840.0, "step": 853, "train/ce_loss": 0.9588896036148071 }, { "epoch": 0.08433854063674115, "step": 853, "train/sim_loss": 0.06640625 }, { "epoch": 0.08433854063674115, "step": 853, "train/total_loss": 0.16229522228240967 }, { "entropy": 9.483179092407227, "epoch": 0.08443741348625668, "mean_token_accuracy": 0.7219321131706238, "num_tokens": 4702256.0, "step": 854, "train/ce_loss": 1.162400245666504 }, { "epoch": 0.08443741348625668, "step": 854, "train/sim_loss": 0.125 }, { "epoch": 0.08443741348625668, "step": 854, "train/total_loss": 0.2412400245666504 }, { "entropy": 9.643060684204102, "epoch": 0.0845362863357722, "mean_token_accuracy": 0.7488299608230591, "num_tokens": 4707535.0, "step": 855, "train/ce_loss": 0.8555900454521179 }, { "epoch": 0.0845362863357722, "step": 855, "train/sim_loss": 0.05859375 }, { "epoch": 0.0845362863357722, "step": 855, "train/total_loss": 0.14415276050567627 }, { "entropy": 9.54561996459961, "epoch": 0.08463515918528772, "mean_token_accuracy": 0.7366412281990051, "num_tokens": 4712895.0, "step": 856, "train/ce_loss": 0.6866753101348877 }, { "epoch": 0.08463515918528772, "step": 856, "train/sim_loss": 0.08984375 }, { "epoch": 0.08463515918528772, "step": 856, "train/total_loss": 0.15851128101348877 }, { "entropy": 9.66610336303711, "epoch": 0.08473403203480324, "mean_token_accuracy": 0.757446825504303, "num_tokens": 4718231.0, "step": 857, "train/ce_loss": 0.7062276005744934 }, { "epoch": 0.08473403203480324, "step": 857, "train/sim_loss": 0.05078125 }, { "epoch": 0.08473403203480324, "step": 857, "train/total_loss": 0.1214040145277977 }, { "entropy": 9.345430374145508, "epoch": 0.08483290488431877, "mean_token_accuracy": 0.7451205253601074, "num_tokens": 4723762.0, "step": 858, "train/ce_loss": 0.7179973721504211 }, { "epoch": 0.08483290488431877, "step": 858, "train/sim_loss": 0.046875 }, { "epoch": 0.08483290488431877, "step": 858, "train/total_loss": 0.11867474019527435 }, { "entropy": 9.22994327545166, "epoch": 0.08493177773383428, "mean_token_accuracy": 0.7010309100151062, "num_tokens": 4729364.0, "step": 859, "train/ce_loss": 1.6779733896255493 }, { "epoch": 0.08493177773383428, "step": 859, "train/sim_loss": 0.13671875 }, { "epoch": 0.08493177773383428, "step": 859, "train/total_loss": 0.304516077041626 }, { "epoch": 0.08503065058334981, "grad_norm": 1.2609254121780396, "learning_rate": 9.790090491025071e-06, "loss": 0.1902, "step": 860 }, { "entropy": 9.051291465759277, "epoch": 0.08503065058334981, "mean_token_accuracy": 0.6889097690582275, "num_tokens": 4735115.0, "step": 860, "train/ce_loss": 0.9129635691642761 }, { "epoch": 0.08503065058334981, "step": 860, "train/sim_loss": 0.11328125 }, { "epoch": 0.08503065058334981, "step": 860, "train/total_loss": 0.20457760989665985 }, { "entropy": 9.528076171875, "epoch": 0.08512952343286534, "mean_token_accuracy": 0.7113276720046997, "num_tokens": 4740525.0, "step": 861, "train/ce_loss": 0.7400087714195251 }, { "epoch": 0.08512952343286534, "step": 861, "train/sim_loss": 0.140625 }, { "epoch": 0.08512952343286534, "step": 861, "train/total_loss": 0.21462588012218475 }, { "entropy": 9.439704895019531, "epoch": 0.08522839628238085, "mean_token_accuracy": 0.7422434091567993, "num_tokens": 4745988.0, "step": 862, "train/ce_loss": 1.7253462076187134 }, { "epoch": 0.08522839628238085, "step": 862, "train/sim_loss": 0.1328125 }, { "epoch": 0.08522839628238085, "step": 862, "train/total_loss": 0.30534714460372925 }, { "entropy": 9.186038970947266, "epoch": 0.08532726913189638, "mean_token_accuracy": 0.7550802230834961, "num_tokens": 4751582.0, "step": 863, "train/ce_loss": 0.8691436648368835 }, { "epoch": 0.08532726913189638, "step": 863, "train/sim_loss": 0.0390625 }, { "epoch": 0.08532726913189638, "step": 863, "train/total_loss": 0.12597686052322388 }, { "entropy": 9.093668937683105, "epoch": 0.08542614198141191, "mean_token_accuracy": 0.7037392258644104, "num_tokens": 4757218.0, "step": 864, "train/ce_loss": 0.4905575215816498 }, { "epoch": 0.08542614198141191, "step": 864, "train/sim_loss": 0.0625 }, { "epoch": 0.08542614198141191, "step": 864, "train/total_loss": 0.11155575513839722 }, { "entropy": 9.096546173095703, "epoch": 0.08552501483092742, "mean_token_accuracy": 0.8016967177391052, "num_tokens": 4762788.0, "step": 865, "train/ce_loss": 0.44172248244285583 }, { "epoch": 0.08552501483092742, "step": 865, "train/sim_loss": 0.03125 }, { "epoch": 0.08552501483092742, "step": 865, "train/total_loss": 0.0754222497344017 }, { "entropy": 9.69709300994873, "epoch": 0.08562388768044295, "mean_token_accuracy": 0.727393627166748, "num_tokens": 4768099.0, "step": 866, "train/ce_loss": 0.9208095073699951 }, { "epoch": 0.08562388768044295, "step": 866, "train/sim_loss": 0.078125 }, { "epoch": 0.08562388768044295, "step": 866, "train/total_loss": 0.1702059507369995 }, { "entropy": 9.254475593566895, "epoch": 0.08572276052995847, "mean_token_accuracy": 0.6962025165557861, "num_tokens": 4773570.0, "step": 867, "train/ce_loss": 1.0801947116851807 }, { "epoch": 0.08572276052995847, "step": 867, "train/sim_loss": 0.0859375 }, { "epoch": 0.08572276052995847, "step": 867, "train/total_loss": 0.19395697116851807 }, { "entropy": 9.215788841247559, "epoch": 0.085821633379474, "mean_token_accuracy": 0.7128205299377441, "num_tokens": 4779189.0, "step": 868, "train/ce_loss": 0.8374297022819519 }, { "epoch": 0.085821633379474, "step": 868, "train/sim_loss": 0.0546875 }, { "epoch": 0.085821633379474, "step": 868, "train/total_loss": 0.13843047618865967 }, { "entropy": 9.041790962219238, "epoch": 0.08592050622898952, "mean_token_accuracy": 0.7443744540214539, "num_tokens": 4784898.0, "step": 869, "train/ce_loss": 0.5486629605293274 }, { "epoch": 0.08592050622898952, "step": 869, "train/sim_loss": 0.125 }, { "epoch": 0.08592050622898952, "step": 869, "train/total_loss": 0.17986629903316498 }, { "entropy": 9.389067649841309, "epoch": 0.08601937907850504, "mean_token_accuracy": 0.7356181144714355, "num_tokens": 4790339.0, "step": 870, "train/ce_loss": 0.5774324536323547 }, { "epoch": 0.08601937907850504, "step": 870, "train/sim_loss": 0.04296875 }, { "epoch": 0.08601937907850504, "step": 870, "train/total_loss": 0.10071200132369995 }, { "entropy": 9.868613243103027, "epoch": 0.08611825192802057, "mean_token_accuracy": 0.7487603425979614, "num_tokens": 4795558.0, "step": 871, "train/ce_loss": 0.8800450563430786 }, { "epoch": 0.08611825192802057, "step": 871, "train/sim_loss": 0.07421875 }, { "epoch": 0.08611825192802057, "step": 871, "train/total_loss": 0.16222324967384338 }, { "entropy": 9.400403022766113, "epoch": 0.08621712477753608, "mean_token_accuracy": 0.7068126797676086, "num_tokens": 4801018.0, "step": 872, "train/ce_loss": 0.6158300042152405 }, { "epoch": 0.08621712477753608, "step": 872, "train/sim_loss": 0.15234375 }, { "epoch": 0.08621712477753608, "step": 872, "train/total_loss": 0.2139267474412918 }, { "entropy": 9.673697471618652, "epoch": 0.08631599762705161, "mean_token_accuracy": 0.7893258333206177, "num_tokens": 4806254.0, "step": 873, "train/ce_loss": 0.6224079728126526 }, { "epoch": 0.08631599762705161, "step": 873, "train/sim_loss": 0.13671875 }, { "epoch": 0.08631599762705161, "step": 873, "train/total_loss": 0.19895954430103302 }, { "entropy": 9.459856033325195, "epoch": 0.08641487047656714, "mean_token_accuracy": 0.7637698650360107, "num_tokens": 4811673.0, "step": 874, "train/ce_loss": 0.6243445873260498 }, { "epoch": 0.08641487047656714, "step": 874, "train/sim_loss": 0.109375 }, { "epoch": 0.08641487047656714, "step": 874, "train/total_loss": 0.17180946469306946 }, { "entropy": 9.47743034362793, "epoch": 0.08651374332608265, "mean_token_accuracy": 0.7311828136444092, "num_tokens": 4817068.0, "step": 875, "train/ce_loss": 0.8701171875 }, { "epoch": 0.08651374332608265, "step": 875, "train/sim_loss": 0.0859375 }, { "epoch": 0.08651374332608265, "step": 875, "train/total_loss": 0.17294922471046448 }, { "entropy": 9.507213592529297, "epoch": 0.08661261617559818, "mean_token_accuracy": 0.7081151604652405, "num_tokens": 4822372.0, "step": 876, "train/ce_loss": 1.1269277334213257 }, { "epoch": 0.08661261617559818, "step": 876, "train/sim_loss": 0.109375 }, { "epoch": 0.08661261617559818, "step": 876, "train/total_loss": 0.22206777334213257 }, { "entropy": 9.126535415649414, "epoch": 0.0867114890251137, "mean_token_accuracy": 0.7306910753250122, "num_tokens": 4828014.0, "step": 877, "train/ce_loss": 0.7168021202087402 }, { "epoch": 0.0867114890251137, "step": 877, "train/sim_loss": 0.1171875 }, { "epoch": 0.0867114890251137, "step": 877, "train/total_loss": 0.1888677179813385 }, { "entropy": 9.167840957641602, "epoch": 0.08681036187462923, "mean_token_accuracy": 0.7198697328567505, "num_tokens": 4833564.0, "step": 878, "train/ce_loss": 1.2187457084655762 }, { "epoch": 0.08681036187462923, "step": 878, "train/sim_loss": 0.109375 }, { "epoch": 0.08681036187462923, "step": 878, "train/total_loss": 0.23124957084655762 }, { "entropy": 9.341957092285156, "epoch": 0.08690923472414475, "mean_token_accuracy": 0.7653179168701172, "num_tokens": 4839067.0, "step": 879, "train/ce_loss": 0.689608633518219 }, { "epoch": 0.08690923472414475, "step": 879, "train/sim_loss": 0.08203125 }, { "epoch": 0.08690923472414475, "step": 879, "train/total_loss": 0.15099212527275085 }, { "epoch": 0.08700810757366027, "grad_norm": 0.9398627281188965, "learning_rate": 9.785145626267124e-06, "loss": 0.1812, "step": 880 }, { "entropy": 9.232715606689453, "epoch": 0.08700810757366027, "mean_token_accuracy": 0.7567264437675476, "num_tokens": 4844652.0, "step": 880, "train/ce_loss": 0.7297532558441162 }, { "epoch": 0.08700810757366027, "step": 880, "train/sim_loss": 0.03125 }, { "epoch": 0.08700810757366027, "step": 880, "train/total_loss": 0.10422533005475998 }, { "entropy": 9.292266845703125, "epoch": 0.0871069804231758, "mean_token_accuracy": 0.7292110919952393, "num_tokens": 4850185.0, "step": 881, "train/ce_loss": 1.0922445058822632 }, { "epoch": 0.0871069804231758, "step": 881, "train/sim_loss": 0.07421875 }, { "epoch": 0.0871069804231758, "step": 881, "train/total_loss": 0.18344320356845856 }, { "entropy": 9.261861801147461, "epoch": 0.08720585327269131, "mean_token_accuracy": 0.7137203216552734, "num_tokens": 4855555.0, "step": 882, "train/ce_loss": 1.5479929447174072 }, { "epoch": 0.08720585327269131, "step": 882, "train/sim_loss": 0.08203125 }, { "epoch": 0.08720585327269131, "step": 882, "train/total_loss": 0.23683054745197296 }, { "entropy": 9.418441772460938, "epoch": 0.08730472612220684, "mean_token_accuracy": 0.7406103014945984, "num_tokens": 4861056.0, "step": 883, "train/ce_loss": 1.2506126165390015 }, { "epoch": 0.08730472612220684, "step": 883, "train/sim_loss": 0.125 }, { "epoch": 0.08730472612220684, "step": 883, "train/total_loss": 0.2500612735748291 }, { "entropy": 9.315123558044434, "epoch": 0.08740359897172237, "mean_token_accuracy": 0.7539393901824951, "num_tokens": 4866489.0, "step": 884, "train/ce_loss": 0.790576696395874 }, { "epoch": 0.08740359897172237, "step": 884, "train/sim_loss": 0.046875 }, { "epoch": 0.08740359897172237, "step": 884, "train/total_loss": 0.12593266367912292 }, { "entropy": 9.298824310302734, "epoch": 0.08750247182123788, "mean_token_accuracy": 0.681664764881134, "num_tokens": 4871983.0, "step": 885, "train/ce_loss": 1.390149712562561 }, { "epoch": 0.08750247182123788, "step": 885, "train/sim_loss": 0.1484375 }, { "epoch": 0.08750247182123788, "step": 885, "train/total_loss": 0.28745245933532715 }, { "entropy": 8.939806938171387, "epoch": 0.08760134467075341, "mean_token_accuracy": 0.7253649830818176, "num_tokens": 4877694.0, "step": 886, "train/ce_loss": 0.6502774357795715 }, { "epoch": 0.08760134467075341, "step": 886, "train/sim_loss": 0.0859375 }, { "epoch": 0.08760134467075341, "step": 886, "train/total_loss": 0.15096524357795715 }, { "entropy": 9.520779609680176, "epoch": 0.08770021752026894, "mean_token_accuracy": 0.7427785396575928, "num_tokens": 4882990.0, "step": 887, "train/ce_loss": 0.5520675778388977 }, { "epoch": 0.08770021752026894, "step": 887, "train/sim_loss": 0.08984375 }, { "epoch": 0.08770021752026894, "step": 887, "train/total_loss": 0.145050510764122 }, { "entropy": 9.452085494995117, "epoch": 0.08779909036978446, "mean_token_accuracy": 0.7789072394371033, "num_tokens": 4888404.0, "step": 888, "train/ce_loss": 0.7599917054176331 }, { "epoch": 0.08779909036978446, "step": 888, "train/sim_loss": 0.03515625 }, { "epoch": 0.08779909036978446, "step": 888, "train/total_loss": 0.1111554205417633 }, { "entropy": 8.973386764526367, "epoch": 0.08789796321929998, "mean_token_accuracy": 0.6783505082130432, "num_tokens": 4893976.0, "step": 889, "train/ce_loss": 1.8136621713638306 }, { "epoch": 0.08789796321929998, "step": 889, "train/sim_loss": 0.109375 }, { "epoch": 0.08789796321929998, "step": 889, "train/total_loss": 0.2907412052154541 }, { "entropy": 9.462360382080078, "epoch": 0.0879968360688155, "mean_token_accuracy": 0.7297297120094299, "num_tokens": 4899353.0, "step": 890, "train/ce_loss": 1.00143301486969 }, { "epoch": 0.0879968360688155, "step": 890, "train/sim_loss": 0.09765625 }, { "epoch": 0.0879968360688155, "step": 890, "train/total_loss": 0.19779956340789795 }, { "entropy": 9.401653289794922, "epoch": 0.08809570891833103, "mean_token_accuracy": 0.7623529434204102, "num_tokens": 4904808.0, "step": 891, "train/ce_loss": 0.6163727045059204 }, { "epoch": 0.08809570891833103, "step": 891, "train/sim_loss": 0.09375 }, { "epoch": 0.08809570891833103, "step": 891, "train/total_loss": 0.1553872674703598 }, { "entropy": 9.614057540893555, "epoch": 0.08819458176784654, "mean_token_accuracy": 0.7043941617012024, "num_tokens": 4910134.0, "step": 892, "train/ce_loss": 0.8787193298339844 }, { "epoch": 0.08819458176784654, "step": 892, "train/sim_loss": 0.109375 }, { "epoch": 0.08819458176784654, "step": 892, "train/total_loss": 0.19724693894386292 }, { "entropy": 9.188618659973145, "epoch": 0.08829345461736207, "mean_token_accuracy": 0.7866666913032532, "num_tokens": 4915773.0, "step": 893, "train/ce_loss": 1.2477495670318604 }, { "epoch": 0.08829345461736207, "step": 893, "train/sim_loss": 0.12890625 }, { "epoch": 0.08829345461736207, "step": 893, "train/total_loss": 0.2536812126636505 }, { "entropy": 9.226936340332031, "epoch": 0.0883923274668776, "mean_token_accuracy": 0.7483296394348145, "num_tokens": 4921304.0, "step": 894, "train/ce_loss": 0.5140079259872437 }, { "epoch": 0.0883923274668776, "step": 894, "train/sim_loss": 0.04296875 }, { "epoch": 0.0883923274668776, "step": 894, "train/total_loss": 0.0943695455789566 }, { "entropy": 9.300542831420898, "epoch": 0.08849120031639311, "mean_token_accuracy": 0.7598152160644531, "num_tokens": 4926750.0, "step": 895, "train/ce_loss": 0.7226269245147705 }, { "epoch": 0.08849120031639311, "step": 895, "train/sim_loss": 0.09375 }, { "epoch": 0.08849120031639311, "step": 895, "train/total_loss": 0.166012704372406 }, { "entropy": 9.24853515625, "epoch": 0.08859007316590864, "mean_token_accuracy": 0.7792362570762634, "num_tokens": 4932206.0, "step": 896, "train/ce_loss": 0.6589966416358948 }, { "epoch": 0.08859007316590864, "step": 896, "train/sim_loss": 0.109375 }, { "epoch": 0.08859007316590864, "step": 896, "train/total_loss": 0.17527467012405396 }, { "entropy": 9.006208419799805, "epoch": 0.08868894601542417, "mean_token_accuracy": 0.669548511505127, "num_tokens": 4937956.0, "step": 897, "train/ce_loss": 1.5489122867584229 }, { "epoch": 0.08868894601542417, "step": 897, "train/sim_loss": 0.09765625 }, { "epoch": 0.08868894601542417, "step": 897, "train/total_loss": 0.2525475025177002 }, { "entropy": 9.209463119506836, "epoch": 0.0887878188649397, "mean_token_accuracy": 0.7568947672843933, "num_tokens": 4943505.0, "step": 898, "train/ce_loss": 0.6351042985916138 }, { "epoch": 0.0887878188649397, "step": 898, "train/sim_loss": 0.0390625 }, { "epoch": 0.0887878188649397, "step": 898, "train/total_loss": 0.10257293283939362 }, { "entropy": 9.021135330200195, "epoch": 0.08888669171445521, "mean_token_accuracy": 0.7314344048500061, "num_tokens": 4949154.0, "step": 899, "train/ce_loss": 0.9543678164482117 }, { "epoch": 0.08888669171445521, "step": 899, "train/sim_loss": 0.14453125 }, { "epoch": 0.08888669171445521, "step": 899, "train/total_loss": 0.23996803164482117 }, { "epoch": 0.08898556456397073, "grad_norm": 1.4774096012115479, "learning_rate": 9.780200761509172e-06, "loss": 0.1759, "step": 900 }, { "entropy": 8.786093711853027, "epoch": 0.08898556456397073, "mean_token_accuracy": 0.7133520245552063, "num_tokens": 4954816.0, "step": 900, "train/ce_loss": 0.5875260233879089 }, { "epoch": 0.08898556456397073, "step": 900, "train/sim_loss": 0.14453125 }, { "epoch": 0.08898556456397073, "step": 900, "train/total_loss": 0.20328384637832642 }, { "entropy": 9.270377159118652, "epoch": 0.08908443741348626, "mean_token_accuracy": 0.7127532958984375, "num_tokens": 4960247.0, "step": 901, "train/ce_loss": 0.8424088358879089 }, { "epoch": 0.08908443741348626, "step": 901, "train/sim_loss": 0.08203125 }, { "epoch": 0.08908443741348626, "step": 901, "train/total_loss": 0.1662721335887909 }, { "entropy": 9.560792922973633, "epoch": 0.08918331026300177, "mean_token_accuracy": 0.7629138827323914, "num_tokens": 4965616.0, "step": 902, "train/ce_loss": 0.7836432456970215 }, { "epoch": 0.08918331026300177, "step": 902, "train/sim_loss": 0.046875 }, { "epoch": 0.08918331026300177, "step": 902, "train/total_loss": 0.1252393275499344 }, { "entropy": 9.093595504760742, "epoch": 0.0892821831125173, "mean_token_accuracy": 0.7497517466545105, "num_tokens": 4971205.0, "step": 903, "train/ce_loss": 0.5199077129364014 }, { "epoch": 0.0892821831125173, "step": 903, "train/sim_loss": 0.0390625 }, { "epoch": 0.0892821831125173, "step": 903, "train/total_loss": 0.09105327725410461 }, { "entropy": 9.072840690612793, "epoch": 0.08938105596203283, "mean_token_accuracy": 0.732570230960846, "num_tokens": 4976789.0, "step": 904, "train/ce_loss": 1.2639224529266357 }, { "epoch": 0.08938105596203283, "step": 904, "train/sim_loss": 0.09765625 }, { "epoch": 0.08938105596203283, "step": 904, "train/total_loss": 0.22404849529266357 }, { "entropy": 9.345746040344238, "epoch": 0.08947992881154834, "mean_token_accuracy": 0.7148102521896362, "num_tokens": 4982231.0, "step": 905, "train/ce_loss": 0.7048497200012207 }, { "epoch": 0.08947992881154834, "step": 905, "train/sim_loss": 0.109375 }, { "epoch": 0.08947992881154834, "step": 905, "train/total_loss": 0.1798599660396576 }, { "entropy": 9.171817779541016, "epoch": 0.08957880166106387, "mean_token_accuracy": 0.798963725566864, "num_tokens": 4987852.0, "step": 906, "train/ce_loss": 0.5430392026901245 }, { "epoch": 0.08957880166106387, "step": 906, "train/sim_loss": 0.0390625 }, { "epoch": 0.08957880166106387, "step": 906, "train/total_loss": 0.09336642175912857 }, { "entropy": 8.983722686767578, "epoch": 0.0896776745105794, "mean_token_accuracy": 0.8099009990692139, "num_tokens": 4993514.0, "step": 907, "train/ce_loss": 0.47942882776260376 }, { "epoch": 0.0896776745105794, "step": 907, "train/sim_loss": 0.03515625 }, { "epoch": 0.0896776745105794, "step": 907, "train/total_loss": 0.0830991342663765 }, { "entropy": 9.40056037902832, "epoch": 0.08977654736009492, "mean_token_accuracy": 0.7212276458740234, "num_tokens": 4998930.0, "step": 908, "train/ce_loss": 0.8803368806838989 }, { "epoch": 0.08977654736009492, "step": 908, "train/sim_loss": 0.0703125 }, { "epoch": 0.08977654736009492, "step": 908, "train/total_loss": 0.15834619104862213 }, { "entropy": 8.911446571350098, "epoch": 0.08987542020961044, "mean_token_accuracy": 0.705012321472168, "num_tokens": 5004723.0, "step": 909, "train/ce_loss": 0.8696721196174622 }, { "epoch": 0.08987542020961044, "step": 909, "train/sim_loss": 0.1171875 }, { "epoch": 0.08987542020961044, "step": 909, "train/total_loss": 0.20415471494197845 }, { "entropy": 9.457520484924316, "epoch": 0.08997429305912596, "mean_token_accuracy": 0.7424623370170593, "num_tokens": 5010086.0, "step": 910, "train/ce_loss": 1.1579904556274414 }, { "epoch": 0.08997429305912596, "step": 910, "train/sim_loss": 0.06640625 }, { "epoch": 0.08997429305912596, "step": 910, "train/total_loss": 0.18220528960227966 }, { "entropy": 9.064043998718262, "epoch": 0.09007316590864149, "mean_token_accuracy": 0.760534405708313, "num_tokens": 5015599.0, "step": 911, "train/ce_loss": 0.707744836807251 }, { "epoch": 0.09007316590864149, "step": 911, "train/sim_loss": 0.09375 }, { "epoch": 0.09007316590864149, "step": 911, "train/total_loss": 0.16452449560165405 }, { "entropy": 9.303609848022461, "epoch": 0.090172038758157, "mean_token_accuracy": 0.7344300746917725, "num_tokens": 5021104.0, "step": 912, "train/ce_loss": 0.9409158229827881 }, { "epoch": 0.090172038758157, "step": 912, "train/sim_loss": 0.1015625 }, { "epoch": 0.090172038758157, "step": 912, "train/total_loss": 0.19565409421920776 }, { "entropy": 9.133035659790039, "epoch": 0.09027091160767253, "mean_token_accuracy": 0.7997967600822449, "num_tokens": 5026930.0, "step": 913, "train/ce_loss": 0.6523807048797607 }, { "epoch": 0.09027091160767253, "step": 913, "train/sim_loss": 0.1328125 }, { "epoch": 0.09027091160767253, "step": 913, "train/total_loss": 0.1980505734682083 }, { "entropy": 9.17814826965332, "epoch": 0.09036978445718806, "mean_token_accuracy": 0.7523148059844971, "num_tokens": 5032384.0, "step": 914, "train/ce_loss": 0.6834198832511902 }, { "epoch": 0.09036978445718806, "step": 914, "train/sim_loss": 0.03515625 }, { "epoch": 0.09036978445718806, "step": 914, "train/total_loss": 0.10349824279546738 }, { "entropy": 9.531439781188965, "epoch": 0.09046865730670357, "mean_token_accuracy": 0.7516512274742126, "num_tokens": 5037708.0, "step": 915, "train/ce_loss": 0.4765205681324005 }, { "epoch": 0.09046865730670357, "step": 915, "train/sim_loss": 0.03515625 }, { "epoch": 0.09046865730670357, "step": 915, "train/total_loss": 0.08280830830335617 }, { "entropy": 9.44800853729248, "epoch": 0.0905675301562191, "mean_token_accuracy": 0.745123565196991, "num_tokens": 5043106.0, "step": 916, "train/ce_loss": 0.790551483631134 }, { "epoch": 0.0905675301562191, "step": 916, "train/sim_loss": 0.09375 }, { "epoch": 0.0905675301562191, "step": 916, "train/total_loss": 0.17280516028404236 }, { "entropy": 9.378689765930176, "epoch": 0.09066640300573463, "mean_token_accuracy": 0.7349397540092468, "num_tokens": 5048514.0, "step": 917, "train/ce_loss": 0.5631740093231201 }, { "epoch": 0.09066640300573463, "step": 917, "train/sim_loss": 0.046875 }, { "epoch": 0.09066640300573463, "step": 917, "train/total_loss": 0.10319240391254425 }, { "entropy": 9.612150192260742, "epoch": 0.09076527585525015, "mean_token_accuracy": 0.7760778665542603, "num_tokens": 5053711.0, "step": 918, "train/ce_loss": 0.6191260814666748 }, { "epoch": 0.09076527585525015, "step": 918, "train/sim_loss": 0.0390625 }, { "epoch": 0.09076527585525015, "step": 918, "train/total_loss": 0.10097511112689972 }, { "entropy": 9.263891220092773, "epoch": 0.09086414870476567, "mean_token_accuracy": 0.7257072329521179, "num_tokens": 5059199.0, "step": 919, "train/ce_loss": 1.1389966011047363 }, { "epoch": 0.09086414870476567, "step": 919, "train/sim_loss": 0.078125 }, { "epoch": 0.09086414870476567, "step": 919, "train/total_loss": 0.19202466309070587 }, { "epoch": 0.0909630215542812, "grad_norm": 1.0394600629806519, "learning_rate": 9.775255896751225e-06, "loss": 0.1696, "step": 920 }, { "entropy": 9.127363204956055, "epoch": 0.0909630215542812, "mean_token_accuracy": 0.7123430967330933, "num_tokens": 5064811.0, "step": 920, "train/ce_loss": 0.645117998123169 }, { "epoch": 0.0909630215542812, "step": 920, "train/sim_loss": 0.1171875 }, { "epoch": 0.0909630215542812, "step": 920, "train/total_loss": 0.18169930577278137 }, { "entropy": 8.647361755371094, "epoch": 0.09106189440379672, "mean_token_accuracy": 0.663922131061554, "num_tokens": 5070752.0, "step": 921, "train/ce_loss": 1.8453338146209717 }, { "epoch": 0.09106189440379672, "step": 921, "train/sim_loss": 0.0703125 }, { "epoch": 0.09106189440379672, "step": 921, "train/total_loss": 0.25484588742256165 }, { "entropy": 9.369365692138672, "epoch": 0.09116076725331224, "mean_token_accuracy": 0.7808219194412231, "num_tokens": 5076120.0, "step": 922, "train/ce_loss": 0.5476551055908203 }, { "epoch": 0.09116076725331224, "step": 922, "train/sim_loss": 0.0390625 }, { "epoch": 0.09116076725331224, "step": 922, "train/total_loss": 0.09382800757884979 }, { "entropy": 8.982980728149414, "epoch": 0.09125964010282776, "mean_token_accuracy": 0.7407407164573669, "num_tokens": 5081797.0, "step": 923, "train/ce_loss": 0.8922906517982483 }, { "epoch": 0.09125964010282776, "step": 923, "train/sim_loss": 0.07421875 }, { "epoch": 0.09125964010282776, "step": 923, "train/total_loss": 0.16344782710075378 }, { "entropy": 9.192646980285645, "epoch": 0.09135851295234329, "mean_token_accuracy": 0.6890756487846375, "num_tokens": 5087297.0, "step": 924, "train/ce_loss": 0.7108357548713684 }, { "epoch": 0.09135851295234329, "step": 924, "train/sim_loss": 0.078125 }, { "epoch": 0.09135851295234329, "step": 924, "train/total_loss": 0.14920857548713684 }, { "entropy": 9.14930534362793, "epoch": 0.0914573858018588, "mean_token_accuracy": 0.7323788404464722, "num_tokens": 5092828.0, "step": 925, "train/ce_loss": 0.8301072716712952 }, { "epoch": 0.0914573858018588, "step": 925, "train/sim_loss": 0.0625 }, { "epoch": 0.0914573858018588, "step": 925, "train/total_loss": 0.145510733127594 }, { "entropy": 9.558605194091797, "epoch": 0.09155625865137433, "mean_token_accuracy": 0.7430555820465088, "num_tokens": 5098171.0, "step": 926, "train/ce_loss": 1.13265061378479 }, { "epoch": 0.09155625865137433, "step": 926, "train/sim_loss": 0.09765625 }, { "epoch": 0.09155625865137433, "step": 926, "train/total_loss": 0.21092131733894348 }, { "entropy": 9.41970443725586, "epoch": 0.09165513150088986, "mean_token_accuracy": 0.7395833134651184, "num_tokens": 5103525.0, "step": 927, "train/ce_loss": 0.6657501459121704 }, { "epoch": 0.09165513150088986, "step": 927, "train/sim_loss": 0.109375 }, { "epoch": 0.09165513150088986, "step": 927, "train/total_loss": 0.17595002055168152 }, { "entropy": 9.128458976745605, "epoch": 0.09175400435040539, "mean_token_accuracy": 0.679711639881134, "num_tokens": 5109138.0, "step": 928, "train/ce_loss": 0.7857465147972107 }, { "epoch": 0.09175400435040539, "step": 928, "train/sim_loss": 0.125 }, { "epoch": 0.09175400435040539, "step": 928, "train/total_loss": 0.20357465744018555 }, { "entropy": 9.198617935180664, "epoch": 0.0918528771999209, "mean_token_accuracy": 0.7369697093963623, "num_tokens": 5114547.0, "step": 929, "train/ce_loss": 0.9122666716575623 }, { "epoch": 0.0918528771999209, "step": 929, "train/sim_loss": 0.09765625 }, { "epoch": 0.0918528771999209, "step": 929, "train/total_loss": 0.18888291716575623 }, { "entropy": 9.121241569519043, "epoch": 0.09195175004943643, "mean_token_accuracy": 0.7629362344741821, "num_tokens": 5120051.0, "step": 930, "train/ce_loss": 0.7849425673484802 }, { "epoch": 0.09195175004943643, "step": 930, "train/sim_loss": 0.16015625 }, { "epoch": 0.09195175004943643, "step": 930, "train/total_loss": 0.23865050077438354 }, { "entropy": 9.438669204711914, "epoch": 0.09205062289895195, "mean_token_accuracy": 0.7457846999168396, "num_tokens": 5125631.0, "step": 931, "train/ce_loss": 0.49279335141181946 }, { "epoch": 0.09205062289895195, "step": 931, "train/sim_loss": 0.13671875 }, { "epoch": 0.09205062289895195, "step": 931, "train/total_loss": 0.1859980821609497 }, { "entropy": 9.447044372558594, "epoch": 0.09214949574846747, "mean_token_accuracy": 0.7235984206199646, "num_tokens": 5130981.0, "step": 932, "train/ce_loss": 0.6512894034385681 }, { "epoch": 0.09214949574846747, "step": 932, "train/sim_loss": 0.109375 }, { "epoch": 0.09214949574846747, "step": 932, "train/total_loss": 0.17450395226478577 }, { "entropy": 8.822547912597656, "epoch": 0.092248368597983, "mean_token_accuracy": 0.6531049013137817, "num_tokens": 5136422.0, "step": 933, "train/ce_loss": 0.5236958265304565 }, { "epoch": 0.092248368597983, "step": 933, "train/sim_loss": 0.078125 }, { "epoch": 0.092248368597983, "step": 933, "train/total_loss": 0.13049457967281342 }, { "entropy": 9.45274543762207, "epoch": 0.09234724144749852, "mean_token_accuracy": 0.752043604850769, "num_tokens": 5141745.0, "step": 934, "train/ce_loss": 1.08139967918396 }, { "epoch": 0.09234724144749852, "step": 934, "train/sim_loss": 0.0546875 }, { "epoch": 0.09234724144749852, "step": 934, "train/total_loss": 0.16282746195793152 }, { "entropy": 9.439166069030762, "epoch": 0.09244611429701403, "mean_token_accuracy": 0.7403246164321899, "num_tokens": 5147151.0, "step": 935, "train/ce_loss": 0.7453115582466125 }, { "epoch": 0.09244611429701403, "step": 935, "train/sim_loss": 0.09765625 }, { "epoch": 0.09244611429701403, "step": 935, "train/total_loss": 0.1721874177455902 }, { "entropy": 8.53639030456543, "epoch": 0.09254498714652956, "mean_token_accuracy": 0.7821522355079651, "num_tokens": 5152952.0, "step": 936, "train/ce_loss": 0.33763399720191956 }, { "epoch": 0.09254498714652956, "step": 936, "train/sim_loss": 0.1015625 }, { "epoch": 0.09254498714652956, "step": 936, "train/total_loss": 0.13532590866088867 }, { "entropy": 9.567885398864746, "epoch": 0.09264385999604509, "mean_token_accuracy": 0.7405247688293457, "num_tokens": 5158237.0, "step": 937, "train/ce_loss": 0.48259633779525757 }, { "epoch": 0.09264385999604509, "step": 937, "train/sim_loss": 0.09375 }, { "epoch": 0.09264385999604509, "step": 937, "train/total_loss": 0.14200963079929352 }, { "entropy": 9.160867691040039, "epoch": 0.09274273284556062, "mean_token_accuracy": 0.7353603839874268, "num_tokens": 5163793.0, "step": 938, "train/ce_loss": 0.6538066267967224 }, { "epoch": 0.09274273284556062, "step": 938, "train/sim_loss": 0.14453125 }, { "epoch": 0.09274273284556062, "step": 938, "train/total_loss": 0.20991191267967224 }, { "entropy": 9.085721969604492, "epoch": 0.09284160569507613, "mean_token_accuracy": 0.8014861941337585, "num_tokens": 5169305.0, "step": 939, "train/ce_loss": 0.5394171476364136 }, { "epoch": 0.09284160569507613, "step": 939, "train/sim_loss": 0.05078125 }, { "epoch": 0.09284160569507613, "step": 939, "train/total_loss": 0.10472296178340912 }, { "epoch": 0.09294047854459166, "grad_norm": 0.9492486119270325, "learning_rate": 9.770311031993277e-06, "loss": 0.1801, "step": 940 }, { "entropy": 8.96943473815918, "epoch": 0.09294047854459166, "mean_token_accuracy": 0.7811791300773621, "num_tokens": 5174837.0, "step": 940, "train/ce_loss": 0.5610426664352417 }, { "epoch": 0.09294047854459166, "step": 940, "train/sim_loss": 0.1015625 }, { "epoch": 0.09294047854459166, "step": 940, "train/total_loss": 0.15766677260398865 }, { "entropy": 8.92251205444336, "epoch": 0.09303935139410718, "mean_token_accuracy": 0.710629940032959, "num_tokens": 5180358.0, "step": 941, "train/ce_loss": 1.9084241390228271 }, { "epoch": 0.09303935139410718, "step": 941, "train/sim_loss": 0.078125 }, { "epoch": 0.09303935139410718, "step": 941, "train/total_loss": 0.2689674198627472 }, { "entropy": 9.022502899169922, "epoch": 0.0931382242436227, "mean_token_accuracy": 0.8042035102844238, "num_tokens": 5185906.0, "step": 942, "train/ce_loss": 0.5097102522850037 }, { "epoch": 0.0931382242436227, "step": 942, "train/sim_loss": 0.04296875 }, { "epoch": 0.0931382242436227, "step": 942, "train/total_loss": 0.09393978118896484 }, { "entropy": 9.104101181030273, "epoch": 0.09323709709313822, "mean_token_accuracy": 0.7704917788505554, "num_tokens": 5191446.0, "step": 943, "train/ce_loss": 0.5316740870475769 }, { "epoch": 0.09323709709313822, "step": 943, "train/sim_loss": 0.03515625 }, { "epoch": 0.09323709709313822, "step": 943, "train/total_loss": 0.08832366019487381 }, { "entropy": 8.997488021850586, "epoch": 0.09333596994265375, "mean_token_accuracy": 0.7355035543441772, "num_tokens": 5196966.0, "step": 944, "train/ce_loss": 0.734092116355896 }, { "epoch": 0.09333596994265375, "step": 944, "train/sim_loss": 0.02734375 }, { "epoch": 0.09333596994265375, "step": 944, "train/total_loss": 0.10075296461582184 }, { "entropy": 9.354175567626953, "epoch": 0.09343484279216926, "mean_token_accuracy": 0.7314931154251099, "num_tokens": 5202381.0, "step": 945, "train/ce_loss": 0.6806911826133728 }, { "epoch": 0.09343484279216926, "step": 945, "train/sim_loss": 0.0859375 }, { "epoch": 0.09343484279216926, "step": 945, "train/total_loss": 0.15400663018226624 }, { "entropy": 9.731459617614746, "epoch": 0.09353371564168479, "mean_token_accuracy": 0.7901861071586609, "num_tokens": 5207612.0, "step": 946, "train/ce_loss": 0.7627028822898865 }, { "epoch": 0.09353371564168479, "step": 946, "train/sim_loss": 0.06640625 }, { "epoch": 0.09353371564168479, "step": 946, "train/total_loss": 0.14267653226852417 }, { "entropy": 8.66770076751709, "epoch": 0.09363258849120032, "mean_token_accuracy": 0.7324073910713196, "num_tokens": 5213312.0, "step": 947, "train/ce_loss": 0.8039005994796753 }, { "epoch": 0.09363258849120032, "step": 947, "train/sim_loss": 0.109375 }, { "epoch": 0.09363258849120032, "step": 947, "train/total_loss": 0.189765065908432 }, { "entropy": 8.977514266967773, "epoch": 0.09373146134071583, "mean_token_accuracy": 0.7737962007522583, "num_tokens": 5218697.0, "step": 948, "train/ce_loss": 0.9853711128234863 }, { "epoch": 0.09373146134071583, "step": 948, "train/sim_loss": 0.08203125 }, { "epoch": 0.09373146134071583, "step": 948, "train/total_loss": 0.1805683672428131 }, { "entropy": 9.04878044128418, "epoch": 0.09383033419023136, "mean_token_accuracy": 0.6941297650337219, "num_tokens": 5224290.0, "step": 949, "train/ce_loss": 0.5475206971168518 }, { "epoch": 0.09383033419023136, "step": 949, "train/sim_loss": 0.07421875 }, { "epoch": 0.09383033419023136, "step": 949, "train/total_loss": 0.12897081673145294 }, { "entropy": 8.81050968170166, "epoch": 0.09392920703974689, "mean_token_accuracy": 0.7380281686782837, "num_tokens": 5230003.0, "step": 950, "train/ce_loss": 1.2810709476470947 }, { "epoch": 0.09392920703974689, "step": 950, "train/sim_loss": 0.0859375 }, { "epoch": 0.09392920703974689, "step": 950, "train/total_loss": 0.21404460072517395 }, { "entropy": 8.944576263427734, "epoch": 0.09402807988926241, "mean_token_accuracy": 0.7778874635696411, "num_tokens": 5235684.0, "step": 951, "train/ce_loss": 0.5252327919006348 }, { "epoch": 0.09402807988926241, "step": 951, "train/sim_loss": 0.04296875 }, { "epoch": 0.09402807988926241, "step": 951, "train/total_loss": 0.09549203515052795 }, { "entropy": 9.284594535827637, "epoch": 0.09412695273877793, "mean_token_accuracy": 0.7262638807296753, "num_tokens": 5241028.0, "step": 952, "train/ce_loss": 0.8699063062667847 }, { "epoch": 0.09412695273877793, "step": 952, "train/sim_loss": 0.08984375 }, { "epoch": 0.09412695273877793, "step": 952, "train/total_loss": 0.176834374666214 }, { "entropy": 9.168868064880371, "epoch": 0.09422582558829345, "mean_token_accuracy": 0.7773109078407288, "num_tokens": 5246555.0, "step": 953, "train/ce_loss": 0.9263587594032288 }, { "epoch": 0.09422582558829345, "step": 953, "train/sim_loss": 0.0703125 }, { "epoch": 0.09422582558829345, "step": 953, "train/total_loss": 0.1629483699798584 }, { "entropy": 9.054646492004395, "epoch": 0.09432469843780898, "mean_token_accuracy": 0.682758629322052, "num_tokens": 5252100.0, "step": 954, "train/ce_loss": 1.2967135906219482 }, { "epoch": 0.09432469843780898, "step": 954, "train/sim_loss": 0.1171875 }, { "epoch": 0.09432469843780898, "step": 954, "train/total_loss": 0.2468588650226593 }, { "entropy": 9.244196891784668, "epoch": 0.0944235712873245, "mean_token_accuracy": 0.7011628150939941, "num_tokens": 5257562.0, "step": 955, "train/ce_loss": 0.6307183504104614 }, { "epoch": 0.0944235712873245, "step": 955, "train/sim_loss": 0.08203125 }, { "epoch": 0.0944235712873245, "step": 955, "train/total_loss": 0.1451030969619751 }, { "entropy": 8.818082809448242, "epoch": 0.09452244413684002, "mean_token_accuracy": 0.764102578163147, "num_tokens": 5263339.0, "step": 956, "train/ce_loss": 0.6101877689361572 }, { "epoch": 0.09452244413684002, "step": 956, "train/sim_loss": 0.06640625 }, { "epoch": 0.09452244413684002, "step": 956, "train/total_loss": 0.12742502987384796 }, { "entropy": 9.516603469848633, "epoch": 0.09462131698635555, "mean_token_accuracy": 0.7361111044883728, "num_tokens": 5268681.0, "step": 957, "train/ce_loss": 0.7101122736930847 }, { "epoch": 0.09462131698635555, "step": 957, "train/sim_loss": 0.09375 }, { "epoch": 0.09462131698635555, "step": 957, "train/total_loss": 0.1647612303495407 }, { "entropy": 9.222953796386719, "epoch": 0.09472018983587106, "mean_token_accuracy": 0.7511211037635803, "num_tokens": 5274170.0, "step": 958, "train/ce_loss": 0.7736159563064575 }, { "epoch": 0.09472018983587106, "step": 958, "train/sim_loss": 0.1171875 }, { "epoch": 0.09472018983587106, "step": 958, "train/total_loss": 0.194549098610878 }, { "entropy": 9.260988235473633, "epoch": 0.09481906268538659, "mean_token_accuracy": 0.7170542478561401, "num_tokens": 5279529.0, "step": 959, "train/ce_loss": 0.7015955448150635 }, { "epoch": 0.09481906268538659, "step": 959, "train/sim_loss": 0.08984375 }, { "epoch": 0.09481906268538659, "step": 959, "train/total_loss": 0.16000330448150635 }, { "epoch": 0.09491793553490212, "grad_norm": 1.1465083360671997, "learning_rate": 9.765366167235327e-06, "loss": 0.1732, "step": 960 }, { "entropy": 9.23489761352539, "epoch": 0.09491793553490212, "mean_token_accuracy": 0.7270668148994446, "num_tokens": 5285197.0, "step": 960, "train/ce_loss": 0.703809380531311 }, { "epoch": 0.09491793553490212, "step": 960, "train/sim_loss": 0.1171875 }, { "epoch": 0.09491793553490212, "step": 960, "train/total_loss": 0.18756844103336334 }, { "entropy": 9.606468200683594, "epoch": 0.09501680838441764, "mean_token_accuracy": 0.7201907634735107, "num_tokens": 5290387.0, "step": 961, "train/ce_loss": 0.8279499411582947 }, { "epoch": 0.09501680838441764, "step": 961, "train/sim_loss": 0.0859375 }, { "epoch": 0.09501680838441764, "step": 961, "train/total_loss": 0.16873249411582947 }, { "entropy": 9.239270210266113, "epoch": 0.09511568123393316, "mean_token_accuracy": 0.7795484662055969, "num_tokens": 5295772.0, "step": 962, "train/ce_loss": 0.5667407512664795 }, { "epoch": 0.09511568123393316, "step": 962, "train/sim_loss": 0.03515625 }, { "epoch": 0.09511568123393316, "step": 962, "train/total_loss": 0.09183032810688019 }, { "entropy": 9.346488952636719, "epoch": 0.09521455408344869, "mean_token_accuracy": 0.7718040347099304, "num_tokens": 5301241.0, "step": 963, "train/ce_loss": 0.6200066208839417 }, { "epoch": 0.09521455408344869, "step": 963, "train/sim_loss": 0.1015625 }, { "epoch": 0.09521455408344869, "step": 963, "train/total_loss": 0.16356316208839417 }, { "entropy": 9.220712661743164, "epoch": 0.09531342693296421, "mean_token_accuracy": 0.7256097793579102, "num_tokens": 5306719.0, "step": 964, "train/ce_loss": 0.9973124265670776 }, { "epoch": 0.09531342693296421, "step": 964, "train/sim_loss": 0.1015625 }, { "epoch": 0.09531342693296421, "step": 964, "train/total_loss": 0.2012937366962433 }, { "entropy": 9.433728218078613, "epoch": 0.09541229978247973, "mean_token_accuracy": 0.7595269680023193, "num_tokens": 5312105.0, "step": 965, "train/ce_loss": 0.763810932636261 }, { "epoch": 0.09541229978247973, "step": 965, "train/sim_loss": 0.0625 }, { "epoch": 0.09541229978247973, "step": 965, "train/total_loss": 0.13888108730316162 }, { "entropy": 9.073366165161133, "epoch": 0.09551117263199525, "mean_token_accuracy": 0.7045235633850098, "num_tokens": 5317764.0, "step": 966, "train/ce_loss": 0.4062463045120239 }, { "epoch": 0.09551117263199525, "step": 966, "train/sim_loss": 0.10546875 }, { "epoch": 0.09551117263199525, "step": 966, "train/total_loss": 0.14609338343143463 }, { "entropy": 9.536725044250488, "epoch": 0.09561004548151078, "mean_token_accuracy": 0.7832258343696594, "num_tokens": 5323059.0, "step": 967, "train/ce_loss": 0.5602385401725769 }, { "epoch": 0.09561004548151078, "step": 967, "train/sim_loss": 0.0390625 }, { "epoch": 0.09561004548151078, "step": 967, "train/total_loss": 0.09508635103702545 }, { "entropy": 9.140849113464355, "epoch": 0.0957089183310263, "mean_token_accuracy": 0.7138755917549133, "num_tokens": 5328619.0, "step": 968, "train/ce_loss": 0.4790591597557068 }, { "epoch": 0.0957089183310263, "step": 968, "train/sim_loss": 0.03125 }, { "epoch": 0.0957089183310263, "step": 968, "train/total_loss": 0.07915592193603516 }, { "entropy": 9.143475532531738, "epoch": 0.09580779118054182, "mean_token_accuracy": 0.6986755132675171, "num_tokens": 5334128.0, "step": 969, "train/ce_loss": 1.1765937805175781 }, { "epoch": 0.09580779118054182, "step": 969, "train/sim_loss": 0.03515625 }, { "epoch": 0.09580779118054182, "step": 969, "train/total_loss": 0.15281563997268677 }, { "entropy": 9.14988899230957, "epoch": 0.09590666403005735, "mean_token_accuracy": 0.7525423765182495, "num_tokens": 5339705.0, "step": 970, "train/ce_loss": 0.7445306777954102 }, { "epoch": 0.09590666403005735, "step": 970, "train/sim_loss": 0.1171875 }, { "epoch": 0.09590666403005735, "step": 970, "train/total_loss": 0.19164057075977325 }, { "entropy": 9.233036994934082, "epoch": 0.09600553687957288, "mean_token_accuracy": 0.6913978457450867, "num_tokens": 5345231.0, "step": 971, "train/ce_loss": 1.8594807386398315 }, { "epoch": 0.09600553687957288, "step": 971, "train/sim_loss": 0.09375 }, { "epoch": 0.09600553687957288, "step": 971, "train/total_loss": 0.27969807386398315 }, { "entropy": 9.355999946594238, "epoch": 0.09610440972908839, "mean_token_accuracy": 0.7736351490020752, "num_tokens": 5350622.0, "step": 972, "train/ce_loss": 0.4873180091381073 }, { "epoch": 0.09610440972908839, "step": 972, "train/sim_loss": 0.1171875 }, { "epoch": 0.09610440972908839, "step": 972, "train/total_loss": 0.16591930389404297 }, { "entropy": 9.00129222869873, "epoch": 0.09620328257860392, "mean_token_accuracy": 0.6757263541221619, "num_tokens": 5356346.0, "step": 973, "train/ce_loss": 0.7037826776504517 }, { "epoch": 0.09620328257860392, "step": 973, "train/sim_loss": 0.1328125 }, { "epoch": 0.09620328257860392, "step": 973, "train/total_loss": 0.20319077372550964 }, { "entropy": 9.549809455871582, "epoch": 0.09630215542811944, "mean_token_accuracy": 0.7621082663536072, "num_tokens": 5361555.0, "step": 974, "train/ce_loss": 0.5977106094360352 }, { "epoch": 0.09630215542811944, "step": 974, "train/sim_loss": 0.10546875 }, { "epoch": 0.09630215542811944, "step": 974, "train/total_loss": 0.16523981094360352 }, { "entropy": 9.140820503234863, "epoch": 0.09640102827763496, "mean_token_accuracy": 0.7457795143127441, "num_tokens": 5367116.0, "step": 975, "train/ce_loss": 0.5582002997398376 }, { "epoch": 0.09640102827763496, "step": 975, "train/sim_loss": 0.09765625 }, { "epoch": 0.09640102827763496, "step": 975, "train/total_loss": 0.153476282954216 }, { "entropy": 9.298966407775879, "epoch": 0.09649990112715048, "mean_token_accuracy": 0.7214723825454712, "num_tokens": 5372527.0, "step": 976, "train/ce_loss": 0.5232043862342834 }, { "epoch": 0.09649990112715048, "step": 976, "train/sim_loss": 0.08984375 }, { "epoch": 0.09649990112715048, "step": 976, "train/total_loss": 0.1421641856431961 }, { "entropy": 9.124370574951172, "epoch": 0.09659877397666601, "mean_token_accuracy": 0.7331863045692444, "num_tokens": 5378049.0, "step": 977, "train/ce_loss": 0.5762178897857666 }, { "epoch": 0.09659877397666601, "step": 977, "train/sim_loss": 0.046875 }, { "epoch": 0.09659877397666601, "step": 977, "train/total_loss": 0.1044967919588089 }, { "entropy": 9.30836296081543, "epoch": 0.09669764682618152, "mean_token_accuracy": 0.7915151715278625, "num_tokens": 5383405.0, "step": 978, "train/ce_loss": 0.6277317404747009 }, { "epoch": 0.09669764682618152, "step": 978, "train/sim_loss": 0.046875 }, { "epoch": 0.09669764682618152, "step": 978, "train/total_loss": 0.10964817553758621 }, { "entropy": 9.226985931396484, "epoch": 0.09679651967569705, "mean_token_accuracy": 0.7758817076683044, "num_tokens": 5388859.0, "step": 979, "train/ce_loss": 0.9209808111190796 }, { "epoch": 0.09679651967569705, "step": 979, "train/sim_loss": 0.1015625 }, { "epoch": 0.09679651967569705, "step": 979, "train/total_loss": 0.19366058707237244 }, { "epoch": 0.09689539252521258, "grad_norm": 0.9271367192268372, "learning_rate": 9.760421302477378e-06, "loss": 0.1702, "step": 980 }, { "entropy": 9.027633666992188, "epoch": 0.09689539252521258, "mean_token_accuracy": 0.7078521847724915, "num_tokens": 5394292.0, "step": 980, "train/ce_loss": 0.5923965573310852 }, { "epoch": 0.09689539252521258, "step": 980, "train/sim_loss": 0.078125 }, { "epoch": 0.09689539252521258, "step": 980, "train/total_loss": 0.13736465573310852 }, { "entropy": 8.881030082702637, "epoch": 0.0969942653747281, "mean_token_accuracy": 0.7384324669837952, "num_tokens": 5399876.0, "step": 981, "train/ce_loss": 1.718798279762268 }, { "epoch": 0.0969942653747281, "step": 981, "train/sim_loss": 0.10546875 }, { "epoch": 0.0969942653747281, "step": 981, "train/total_loss": 0.2773485779762268 }, { "entropy": 9.16212272644043, "epoch": 0.09709313822424362, "mean_token_accuracy": 0.7311475276947021, "num_tokens": 5405381.0, "step": 982, "train/ce_loss": 0.7305068969726562 }, { "epoch": 0.09709313822424362, "step": 982, "train/sim_loss": 0.08203125 }, { "epoch": 0.09709313822424362, "step": 982, "train/total_loss": 0.15508194267749786 }, { "entropy": 9.358848571777344, "epoch": 0.09719201107375915, "mean_token_accuracy": 0.7743055820465088, "num_tokens": 5410818.0, "step": 983, "train/ce_loss": 0.40454065799713135 }, { "epoch": 0.09719201107375915, "step": 983, "train/sim_loss": 0.046875 }, { "epoch": 0.09719201107375915, "step": 983, "train/total_loss": 0.08732906728982925 }, { "entropy": 9.230178833007812, "epoch": 0.09729088392327467, "mean_token_accuracy": 0.6777905821800232, "num_tokens": 5416364.0, "step": 984, "train/ce_loss": 0.9663439989089966 }, { "epoch": 0.09729088392327467, "step": 984, "train/sim_loss": 0.109375 }, { "epoch": 0.09729088392327467, "step": 984, "train/total_loss": 0.2060094028711319 }, { "entropy": 9.239079475402832, "epoch": 0.09738975677279019, "mean_token_accuracy": 0.781737208366394, "num_tokens": 5421883.0, "step": 985, "train/ce_loss": 1.1260815858840942 }, { "epoch": 0.09738975677279019, "step": 985, "train/sim_loss": 0.03515625 }, { "epoch": 0.09738975677279019, "step": 985, "train/total_loss": 0.1477644145488739 }, { "entropy": 9.474618911743164, "epoch": 0.09748862962230571, "mean_token_accuracy": 0.7397769689559937, "num_tokens": 5427314.0, "step": 986, "train/ce_loss": 0.9278387427330017 }, { "epoch": 0.09748862962230571, "step": 986, "train/sim_loss": 0.046875 }, { "epoch": 0.09748862962230571, "step": 986, "train/total_loss": 0.1396588683128357 }, { "entropy": 8.9173002243042, "epoch": 0.09758750247182124, "mean_token_accuracy": 0.7764706015586853, "num_tokens": 5433115.0, "step": 987, "train/ce_loss": 0.9935566782951355 }, { "epoch": 0.09758750247182124, "step": 987, "train/sim_loss": 0.16015625 }, { "epoch": 0.09758750247182124, "step": 987, "train/total_loss": 0.25951191782951355 }, { "entropy": 9.410273551940918, "epoch": 0.09768637532133675, "mean_token_accuracy": 0.7679012417793274, "num_tokens": 5438569.0, "step": 988, "train/ce_loss": 0.7858358025550842 }, { "epoch": 0.09768637532133675, "step": 988, "train/sim_loss": 0.11328125 }, { "epoch": 0.09768637532133675, "step": 988, "train/total_loss": 0.19186483323574066 }, { "entropy": 9.432334899902344, "epoch": 0.09778524817085228, "mean_token_accuracy": 0.740024209022522, "num_tokens": 5444063.0, "step": 989, "train/ce_loss": 0.675076961517334 }, { "epoch": 0.09778524817085228, "step": 989, "train/sim_loss": 0.109375 }, { "epoch": 0.09778524817085228, "step": 989, "train/total_loss": 0.17688269913196564 }, { "entropy": 9.694771766662598, "epoch": 0.09788412102036781, "mean_token_accuracy": 0.7058011293411255, "num_tokens": 5449394.0, "step": 990, "train/ce_loss": 1.1351745128631592 }, { "epoch": 0.09788412102036781, "step": 990, "train/sim_loss": 0.08203125 }, { "epoch": 0.09788412102036781, "step": 990, "train/total_loss": 0.19554871320724487 }, { "entropy": 9.498449325561523, "epoch": 0.09798299386988334, "mean_token_accuracy": 0.7139561772346497, "num_tokens": 5454836.0, "step": 991, "train/ce_loss": 0.6707887053489685 }, { "epoch": 0.09798299386988334, "step": 991, "train/sim_loss": 0.15234375 }, { "epoch": 0.09798299386988334, "step": 991, "train/total_loss": 0.2194226235151291 }, { "entropy": 9.350942611694336, "epoch": 0.09808186671939885, "mean_token_accuracy": 0.7160220742225647, "num_tokens": 5460386.0, "step": 992, "train/ce_loss": 0.7591890096664429 }, { "epoch": 0.09808186671939885, "step": 992, "train/sim_loss": 0.11328125 }, { "epoch": 0.09808186671939885, "step": 992, "train/total_loss": 0.18920016288757324 }, { "entropy": 9.014382362365723, "epoch": 0.09818073956891438, "mean_token_accuracy": 0.7463414669036865, "num_tokens": 5465998.0, "step": 993, "train/ce_loss": 0.596236526966095 }, { "epoch": 0.09818073956891438, "step": 993, "train/sim_loss": 0.1484375 }, { "epoch": 0.09818073956891438, "step": 993, "train/total_loss": 0.20806115865707397 }, { "entropy": 9.546076774597168, "epoch": 0.0982796124184299, "mean_token_accuracy": 0.7615979313850403, "num_tokens": 5471357.0, "step": 994, "train/ce_loss": 0.6488422155380249 }, { "epoch": 0.0982796124184299, "step": 994, "train/sim_loss": 0.04296875 }, { "epoch": 0.0982796124184299, "step": 994, "train/total_loss": 0.10785297304391861 }, { "entropy": 9.660165786743164, "epoch": 0.09837848526794542, "mean_token_accuracy": 0.7424836754798889, "num_tokens": 5476668.0, "step": 995, "train/ce_loss": 0.7934048771858215 }, { "epoch": 0.09837848526794542, "step": 995, "train/sim_loss": 0.03125 }, { "epoch": 0.09837848526794542, "step": 995, "train/total_loss": 0.11059048771858215 }, { "entropy": 9.503032684326172, "epoch": 0.09847735811746094, "mean_token_accuracy": 0.7277556657791138, "num_tokens": 5482052.0, "step": 996, "train/ce_loss": 0.8745565414428711 }, { "epoch": 0.09847735811746094, "step": 996, "train/sim_loss": 0.18359375 }, { "epoch": 0.09847735811746094, "step": 996, "train/total_loss": 0.2710494101047516 }, { "entropy": 9.305106163024902, "epoch": 0.09857623096697647, "mean_token_accuracy": 0.7528868317604065, "num_tokens": 5487556.0, "step": 997, "train/ce_loss": 0.8089562058448792 }, { "epoch": 0.09857623096697647, "step": 997, "train/sim_loss": 0.12890625 }, { "epoch": 0.09857623096697647, "step": 997, "train/total_loss": 0.20980188250541687 }, { "entropy": 9.121612548828125, "epoch": 0.09867510381649199, "mean_token_accuracy": 0.72926265001297, "num_tokens": 5493107.0, "step": 998, "train/ce_loss": 0.7956136465072632 }, { "epoch": 0.09867510381649199, "step": 998, "train/sim_loss": 0.0703125 }, { "epoch": 0.09867510381649199, "step": 998, "train/total_loss": 0.14987386763095856 }, { "entropy": 9.185770034790039, "epoch": 0.09877397666600751, "mean_token_accuracy": 0.7130712270736694, "num_tokens": 5498723.0, "step": 999, "train/ce_loss": 1.1072075366973877 }, { "epoch": 0.09877397666600751, "step": 999, "train/sim_loss": 0.046875 }, { "epoch": 0.09877397666600751, "step": 999, "train/total_loss": 0.15759575366973877 }, { "epoch": 0.09887284951552304, "grad_norm": 1.092203974723816, "learning_rate": 9.755476437719428e-06, "loss": 0.1718, "step": 1000 }, { "entropy": 9.252190589904785, "epoch": 0.09887284951552304, "mean_token_accuracy": 0.7549120783805847, "num_tokens": 5504307.0, "step": 1000, "train/ce_loss": 0.9919917583465576 }, { "epoch": 0.09887284951552304, "step": 1000, "train/sim_loss": 0.09375 }, { "epoch": 0.09887284951552304, "step": 1000, "train/total_loss": 0.19294917583465576 }, { "entropy": 9.175333976745605, "epoch": 0.09897172236503857, "mean_token_accuracy": 0.773888349533081, "num_tokens": 5509948.0, "step": 1001, "train/ce_loss": 0.6090295910835266 }, { "epoch": 0.09897172236503857, "step": 1001, "train/sim_loss": 0.08984375 }, { "epoch": 0.09897172236503857, "step": 1001, "train/total_loss": 0.15074670314788818 }, { "entropy": 9.540945053100586, "epoch": 0.09907059521455408, "mean_token_accuracy": 0.7576530575752258, "num_tokens": 5515322.0, "step": 1002, "train/ce_loss": 0.7776941657066345 }, { "epoch": 0.09907059521455408, "step": 1002, "train/sim_loss": 0.0859375 }, { "epoch": 0.09907059521455408, "step": 1002, "train/total_loss": 0.1637069284915924 }, { "entropy": 9.213576316833496, "epoch": 0.09916946806406961, "mean_token_accuracy": 0.6882416605949402, "num_tokens": 5520824.0, "step": 1003, "train/ce_loss": 0.7410731911659241 }, { "epoch": 0.09916946806406961, "step": 1003, "train/sim_loss": 0.078125 }, { "epoch": 0.09916946806406961, "step": 1003, "train/total_loss": 0.1522323191165924 }, { "entropy": 9.188959121704102, "epoch": 0.09926834091358513, "mean_token_accuracy": 0.7379454970359802, "num_tokens": 5526388.0, "step": 1004, "train/ce_loss": 0.6065735220909119 }, { "epoch": 0.09926834091358513, "step": 1004, "train/sim_loss": 0.0390625 }, { "epoch": 0.09926834091358513, "step": 1004, "train/total_loss": 0.09971985220909119 }, { "entropy": 8.973281860351562, "epoch": 0.09936721376310065, "mean_token_accuracy": 0.7534112930297852, "num_tokens": 5532039.0, "step": 1005, "train/ce_loss": 0.8345208168029785 }, { "epoch": 0.09936721376310065, "step": 1005, "train/sim_loss": 0.08984375 }, { "epoch": 0.09936721376310065, "step": 1005, "train/total_loss": 0.17329582571983337 }, { "entropy": 9.375436782836914, "epoch": 0.09946608661261618, "mean_token_accuracy": 0.7316769957542419, "num_tokens": 5537383.0, "step": 1006, "train/ce_loss": 0.9049166440963745 }, { "epoch": 0.09946608661261618, "step": 1006, "train/sim_loss": 0.1015625 }, { "epoch": 0.09946608661261618, "step": 1006, "train/total_loss": 0.1920541673898697 }, { "entropy": 9.071906089782715, "epoch": 0.0995649594621317, "mean_token_accuracy": 0.7238723635673523, "num_tokens": 5542987.0, "step": 1007, "train/ce_loss": 1.1356496810913086 }, { "epoch": 0.0995649594621317, "step": 1007, "train/sim_loss": 0.10546875 }, { "epoch": 0.0995649594621317, "step": 1007, "train/total_loss": 0.21903371810913086 }, { "entropy": 9.699291229248047, "epoch": 0.09966383231164722, "mean_token_accuracy": 0.7104136943817139, "num_tokens": 5548256.0, "step": 1008, "train/ce_loss": 0.5801214575767517 }, { "epoch": 0.09966383231164722, "step": 1008, "train/sim_loss": 0.0703125 }, { "epoch": 0.09966383231164722, "step": 1008, "train/total_loss": 0.12832464277744293 }, { "entropy": 9.342121124267578, "epoch": 0.09976270516116274, "mean_token_accuracy": 0.782608687877655, "num_tokens": 5553575.0, "step": 1009, "train/ce_loss": 0.531728208065033 }, { "epoch": 0.09976270516116274, "step": 1009, "train/sim_loss": 0.0546875 }, { "epoch": 0.09976270516116274, "step": 1009, "train/total_loss": 0.10786032676696777 }, { "entropy": 9.505095481872559, "epoch": 0.09986157801067827, "mean_token_accuracy": 0.7108280062675476, "num_tokens": 5558895.0, "step": 1010, "train/ce_loss": 1.010298490524292 }, { "epoch": 0.09986157801067827, "step": 1010, "train/sim_loss": 0.13671875 }, { "epoch": 0.09986157801067827, "step": 1010, "train/total_loss": 0.23774859309196472 }, { "entropy": 9.129598617553711, "epoch": 0.0999604508601938, "mean_token_accuracy": 0.7466174364089966, "num_tokens": 5564359.0, "step": 1011, "train/ce_loss": 0.9578565359115601 }, { "epoch": 0.0999604508601938, "step": 1011, "train/sim_loss": 0.1484375 }, { "epoch": 0.0999604508601938, "step": 1011, "train/total_loss": 0.24422314763069153 }, { "entropy": 9.584781646728516, "epoch": 0.10005932370970931, "mean_token_accuracy": 0.7216066718101501, "num_tokens": 5569691.0, "step": 1012, "train/ce_loss": 0.667327880859375 }, { "epoch": 0.10005932370970931, "step": 1012, "train/sim_loss": 0.05078125 }, { "epoch": 0.10005932370970931, "step": 1012, "train/total_loss": 0.11751403659582138 }, { "entropy": 8.792314529418945, "epoch": 0.10015819655922484, "mean_token_accuracy": 0.7579564452171326, "num_tokens": 5575537.0, "step": 1013, "train/ce_loss": 0.5467829704284668 }, { "epoch": 0.10015819655922484, "step": 1013, "train/sim_loss": 0.03515625 }, { "epoch": 0.10015819655922484, "step": 1013, "train/total_loss": 0.0898345485329628 }, { "entropy": 9.362556457519531, "epoch": 0.10025706940874037, "mean_token_accuracy": 0.7353308200836182, "num_tokens": 5580996.0, "step": 1014, "train/ce_loss": 0.6697930097579956 }, { "epoch": 0.10025706940874037, "step": 1014, "train/sim_loss": 0.09375 }, { "epoch": 0.10025706940874037, "step": 1014, "train/total_loss": 0.1607293039560318 }, { "entropy": 9.329007148742676, "epoch": 0.10035594225825588, "mean_token_accuracy": 0.7486573457717896, "num_tokens": 5586688.0, "step": 1015, "train/ce_loss": 0.3998299539089203 }, { "epoch": 0.10035594225825588, "step": 1015, "train/sim_loss": 0.03515625 }, { "epoch": 0.10035594225825588, "step": 1015, "train/total_loss": 0.07513924688100815 }, { "entropy": 9.11518669128418, "epoch": 0.1004548151077714, "mean_token_accuracy": 0.6745623350143433, "num_tokens": 5592303.0, "step": 1016, "train/ce_loss": 0.8024645447731018 }, { "epoch": 0.1004548151077714, "step": 1016, "train/sim_loss": 0.07421875 }, { "epoch": 0.1004548151077714, "step": 1016, "train/total_loss": 0.1544651985168457 }, { "entropy": 9.633044242858887, "epoch": 0.10055368795728693, "mean_token_accuracy": 0.7380239367485046, "num_tokens": 5597549.0, "step": 1017, "train/ce_loss": 1.1215583086013794 }, { "epoch": 0.10055368795728693, "step": 1017, "train/sim_loss": 0.07421875 }, { "epoch": 0.10055368795728693, "step": 1017, "train/total_loss": 0.18637457489967346 }, { "entropy": 9.18193244934082, "epoch": 0.10065256080680245, "mean_token_accuracy": 0.7041666507720947, "num_tokens": 5603179.0, "step": 1018, "train/ce_loss": 0.5496519804000854 }, { "epoch": 0.10065256080680245, "step": 1018, "train/sim_loss": 0.109375 }, { "epoch": 0.10065256080680245, "step": 1018, "train/total_loss": 0.16434019804000854 }, { "entropy": 9.408222198486328, "epoch": 0.10075143365631797, "mean_token_accuracy": 0.7907894849777222, "num_tokens": 5608565.0, "step": 1019, "train/ce_loss": 0.800451397895813 }, { "epoch": 0.10075143365631797, "step": 1019, "train/sim_loss": 0.125 }, { "epoch": 0.10075143365631797, "step": 1019, "train/total_loss": 0.20504513382911682 }, { "epoch": 0.1008503065058335, "grad_norm": 1.1601839065551758, "learning_rate": 9.75053157296148e-06, "loss": 0.1701, "step": 1020 }, { "entropy": 9.25759506225586, "epoch": 0.1008503065058335, "mean_token_accuracy": 0.6815642714500427, "num_tokens": 5614066.0, "step": 1020, "train/ce_loss": 1.2802094221115112 }, { "epoch": 0.1008503065058335, "step": 1020, "train/sim_loss": 0.109375 }, { "epoch": 0.1008503065058335, "step": 1020, "train/total_loss": 0.23739594221115112 }, { "entropy": 9.016757011413574, "epoch": 0.10094917935534903, "mean_token_accuracy": 0.7556270360946655, "num_tokens": 5619653.0, "step": 1021, "train/ce_loss": 0.8576896786689758 }, { "epoch": 0.10094917935534903, "step": 1021, "train/sim_loss": 0.0546875 }, { "epoch": 0.10094917935534903, "step": 1021, "train/total_loss": 0.14045646786689758 }, { "entropy": 9.493085861206055, "epoch": 0.10104805220486454, "mean_token_accuracy": 0.7589531540870667, "num_tokens": 5624967.0, "step": 1022, "train/ce_loss": 0.8337810635566711 }, { "epoch": 0.10104805220486454, "step": 1022, "train/sim_loss": 0.078125 }, { "epoch": 0.10104805220486454, "step": 1022, "train/total_loss": 0.16150310635566711 }, { "entropy": 9.160560607910156, "epoch": 0.10114692505438007, "mean_token_accuracy": 0.7464788556098938, "num_tokens": 5630472.0, "step": 1023, "train/ce_loss": 0.8104315996170044 }, { "epoch": 0.10114692505438007, "step": 1023, "train/sim_loss": 0.0703125 }, { "epoch": 0.10114692505438007, "step": 1023, "train/total_loss": 0.15135565400123596 }, { "entropy": 9.634056091308594, "epoch": 0.1012457979038956, "mean_token_accuracy": 0.7367576360702515, "num_tokens": 5635708.0, "step": 1024, "train/ce_loss": 0.766697108745575 }, { "epoch": 0.1012457979038956, "step": 1024, "train/sim_loss": 0.1640625 }, { "epoch": 0.1012457979038956, "step": 1024, "train/total_loss": 0.24073222279548645 }, { "entropy": 9.012795448303223, "epoch": 0.10134467075341111, "mean_token_accuracy": 0.7267932295799255, "num_tokens": 5641345.0, "step": 1025, "train/ce_loss": 1.1144509315490723 }, { "epoch": 0.10134467075341111, "step": 1025, "train/sim_loss": 0.19140625 }, { "epoch": 0.10134467075341111, "step": 1025, "train/total_loss": 0.3028513491153717 }, { "entropy": 9.134471893310547, "epoch": 0.10144354360292664, "mean_token_accuracy": 0.7243816256523132, "num_tokens": 5646796.0, "step": 1026, "train/ce_loss": 0.9321521520614624 }, { "epoch": 0.10144354360292664, "step": 1026, "train/sim_loss": 0.0625 }, { "epoch": 0.10144354360292664, "step": 1026, "train/total_loss": 0.1557152271270752 }, { "entropy": 9.13976764678955, "epoch": 0.10154241645244216, "mean_token_accuracy": 0.7706692814826965, "num_tokens": 5652398.0, "step": 1027, "train/ce_loss": 1.427182674407959 }, { "epoch": 0.10154241645244216, "step": 1027, "train/sim_loss": 0.07421875 }, { "epoch": 0.10154241645244216, "step": 1027, "train/total_loss": 0.21693702042102814 }, { "entropy": 9.568248748779297, "epoch": 0.10164128930195768, "mean_token_accuracy": 0.7631184458732605, "num_tokens": 5657641.0, "step": 1028, "train/ce_loss": 0.43901580572128296 }, { "epoch": 0.10164128930195768, "step": 1028, "train/sim_loss": 0.08984375 }, { "epoch": 0.10164128930195768, "step": 1028, "train/total_loss": 0.13374532759189606 }, { "entropy": 8.907251358032227, "epoch": 0.1017401621514732, "mean_token_accuracy": 0.70944744348526, "num_tokens": 5663395.0, "step": 1029, "train/ce_loss": 0.43206658959388733 }, { "epoch": 0.1017401621514732, "step": 1029, "train/sim_loss": 0.07421875 }, { "epoch": 0.1017401621514732, "step": 1029, "train/total_loss": 0.11742541193962097 }, { "entropy": 9.46541976928711, "epoch": 0.10183903500098873, "mean_token_accuracy": 0.760351300239563, "num_tokens": 5669057.0, "step": 1030, "train/ce_loss": 0.9296289682388306 }, { "epoch": 0.10183903500098873, "step": 1030, "train/sim_loss": 0.10546875 }, { "epoch": 0.10183903500098873, "step": 1030, "train/total_loss": 0.19843164086341858 }, { "entropy": 9.109898567199707, "epoch": 0.10193790785050424, "mean_token_accuracy": 0.7874864935874939, "num_tokens": 5674626.0, "step": 1031, "train/ce_loss": 0.305189847946167 }, { "epoch": 0.10193790785050424, "step": 1031, "train/sim_loss": 0.02734375 }, { "epoch": 0.10193790785050424, "step": 1031, "train/total_loss": 0.05786273628473282 }, { "entropy": 8.798907279968262, "epoch": 0.10203678070001977, "mean_token_accuracy": 0.7472118735313416, "num_tokens": 5680391.0, "step": 1032, "train/ce_loss": 0.4736694395542145 }, { "epoch": 0.10203678070001977, "step": 1032, "train/sim_loss": 0.0625 }, { "epoch": 0.10203678070001977, "step": 1032, "train/total_loss": 0.10986694693565369 }, { "entropy": 8.957969665527344, "epoch": 0.1021356535495353, "mean_token_accuracy": 0.7036713361740112, "num_tokens": 5686127.0, "step": 1033, "train/ce_loss": 0.7771794199943542 }, { "epoch": 0.1021356535495353, "step": 1033, "train/sim_loss": 0.0703125 }, { "epoch": 0.1021356535495353, "step": 1033, "train/total_loss": 0.14803044497966766 }, { "entropy": 9.318188667297363, "epoch": 0.10223452639905083, "mean_token_accuracy": 0.7442977428436279, "num_tokens": 5691584.0, "step": 1034, "train/ce_loss": 1.099534034729004 }, { "epoch": 0.10223452639905083, "step": 1034, "train/sim_loss": 0.05859375 }, { "epoch": 0.10223452639905083, "step": 1034, "train/total_loss": 0.1685471534729004 }, { "entropy": 9.311422348022461, "epoch": 0.10233339924856634, "mean_token_accuracy": 0.7403100728988647, "num_tokens": 5696977.0, "step": 1035, "train/ce_loss": 0.7607100009918213 }, { "epoch": 0.10233339924856634, "step": 1035, "train/sim_loss": 0.0625 }, { "epoch": 0.10233339924856634, "step": 1035, "train/total_loss": 0.13857099413871765 }, { "entropy": 8.769672393798828, "epoch": 0.10243227209808187, "mean_token_accuracy": 0.7230125665664673, "num_tokens": 5702748.0, "step": 1036, "train/ce_loss": 1.306992530822754 }, { "epoch": 0.10243227209808187, "step": 1036, "train/sim_loss": 0.08203125 }, { "epoch": 0.10243227209808187, "step": 1036, "train/total_loss": 0.2127305120229721 }, { "entropy": 9.216048240661621, "epoch": 0.1025311449475974, "mean_token_accuracy": 0.7578718662261963, "num_tokens": 5708263.0, "step": 1037, "train/ce_loss": 0.6591010093688965 }, { "epoch": 0.1025311449475974, "step": 1037, "train/sim_loss": 0.03515625 }, { "epoch": 0.1025311449475974, "step": 1037, "train/total_loss": 0.10106635093688965 }, { "entropy": 9.698881149291992, "epoch": 0.10263001779711291, "mean_token_accuracy": 0.7581903338432312, "num_tokens": 5713511.0, "step": 1038, "train/ce_loss": 0.745625913143158 }, { "epoch": 0.10263001779711291, "step": 1038, "train/sim_loss": 0.05859375 }, { "epoch": 0.10263001779711291, "step": 1038, "train/total_loss": 0.13315634429454803 }, { "entropy": 9.01866340637207, "epoch": 0.10272889064662843, "mean_token_accuracy": 0.6631977558135986, "num_tokens": 5719130.0, "step": 1039, "train/ce_loss": 1.8894011974334717 }, { "epoch": 0.10272889064662843, "step": 1039, "train/sim_loss": 0.078125 }, { "epoch": 0.10272889064662843, "step": 1039, "train/total_loss": 0.2670651078224182 }, { "epoch": 0.10282776349614396, "grad_norm": 0.9355204701423645, "learning_rate": 9.745586708203531e-06, "loss": 0.1698, "step": 1040 }, { "entropy": 9.268583297729492, "epoch": 0.10282776349614396, "mean_token_accuracy": 0.6982758641242981, "num_tokens": 5724573.0, "step": 1040, "train/ce_loss": 1.1116206645965576 }, { "epoch": 0.10282776349614396, "step": 1040, "train/sim_loss": 0.09765625 }, { "epoch": 0.10282776349614396, "step": 1040, "train/total_loss": 0.20881831645965576 }, { "entropy": 9.254871368408203, "epoch": 0.10292663634565948, "mean_token_accuracy": 0.7376543283462524, "num_tokens": 5730110.0, "step": 1041, "train/ce_loss": 0.56211256980896 }, { "epoch": 0.10292663634565948, "step": 1041, "train/sim_loss": 0.0859375 }, { "epoch": 0.10292663634565948, "step": 1041, "train/total_loss": 0.14214876294136047 }, { "entropy": 9.327268600463867, "epoch": 0.103025509195175, "mean_token_accuracy": 0.7124563455581665, "num_tokens": 5735578.0, "step": 1042, "train/ce_loss": 0.9053865671157837 }, { "epoch": 0.103025509195175, "step": 1042, "train/sim_loss": 0.12109375 }, { "epoch": 0.103025509195175, "step": 1042, "train/total_loss": 0.2116324007511139 }, { "entropy": 9.082199096679688, "epoch": 0.10312438204469053, "mean_token_accuracy": 0.7385057210922241, "num_tokens": 5741218.0, "step": 1043, "train/ce_loss": 0.9555494785308838 }, { "epoch": 0.10312438204469053, "step": 1043, "train/sim_loss": 0.078125 }, { "epoch": 0.10312438204469053, "step": 1043, "train/total_loss": 0.17367994785308838 }, { "entropy": 9.146942138671875, "epoch": 0.10322325489420606, "mean_token_accuracy": 0.7516340017318726, "num_tokens": 5746748.0, "step": 1044, "train/ce_loss": 0.67552649974823 }, { "epoch": 0.10322325489420606, "step": 1044, "train/sim_loss": 0.1015625 }, { "epoch": 0.10322325489420606, "step": 1044, "train/total_loss": 0.16911515593528748 }, { "entropy": 9.119318008422852, "epoch": 0.10332212774372157, "mean_token_accuracy": 0.7102803587913513, "num_tokens": 5752247.0, "step": 1045, "train/ce_loss": 1.4681214094161987 }, { "epoch": 0.10332212774372157, "step": 1045, "train/sim_loss": 0.125 }, { "epoch": 0.10332212774372157, "step": 1045, "train/total_loss": 0.2718121409416199 }, { "entropy": 9.231773376464844, "epoch": 0.1034210005932371, "mean_token_accuracy": 0.7546584010124207, "num_tokens": 5757839.0, "step": 1046, "train/ce_loss": 0.5723710656166077 }, { "epoch": 0.1034210005932371, "step": 1046, "train/sim_loss": 0.05859375 }, { "epoch": 0.1034210005932371, "step": 1046, "train/total_loss": 0.11583085358142853 }, { "entropy": 9.420903205871582, "epoch": 0.10351987344275262, "mean_token_accuracy": 0.708185076713562, "num_tokens": 5763313.0, "step": 1047, "train/ce_loss": 0.7905470132827759 }, { "epoch": 0.10351987344275262, "step": 1047, "train/sim_loss": 0.09765625 }, { "epoch": 0.10351987344275262, "step": 1047, "train/total_loss": 0.17671096324920654 }, { "entropy": 8.975730895996094, "epoch": 0.10361874629226814, "mean_token_accuracy": 0.7096385359764099, "num_tokens": 5768811.0, "step": 1048, "train/ce_loss": 0.44666633009910583 }, { "epoch": 0.10361874629226814, "step": 1048, "train/sim_loss": 0.08984375 }, { "epoch": 0.10361874629226814, "step": 1048, "train/total_loss": 0.13451038300991058 }, { "entropy": 9.385661125183105, "epoch": 0.10371761914178367, "mean_token_accuracy": 0.7493036389350891, "num_tokens": 5774150.0, "step": 1049, "train/ce_loss": 0.9139134883880615 }, { "epoch": 0.10371761914178367, "step": 1049, "train/sim_loss": 0.08203125 }, { "epoch": 0.10371761914178367, "step": 1049, "train/total_loss": 0.17342260479927063 }, { "entropy": 9.069623947143555, "epoch": 0.10381649199129919, "mean_token_accuracy": 0.754601240158081, "num_tokens": 5779805.0, "step": 1050, "train/ce_loss": 0.7246804237365723 }, { "epoch": 0.10381649199129919, "step": 1050, "train/sim_loss": 0.1015625 }, { "epoch": 0.10381649199129919, "step": 1050, "train/total_loss": 0.17403054237365723 }, { "entropy": 9.350937843322754, "epoch": 0.1039153648408147, "mean_token_accuracy": 0.7090694904327393, "num_tokens": 5785265.0, "step": 1051, "train/ce_loss": 1.2236511707305908 }, { "epoch": 0.1039153648408147, "step": 1051, "train/sim_loss": 0.05859375 }, { "epoch": 0.1039153648408147, "step": 1051, "train/total_loss": 0.18095886707305908 }, { "entropy": 8.949811935424805, "epoch": 0.10401423769033023, "mean_token_accuracy": 0.7618270516395569, "num_tokens": 5791083.0, "step": 1052, "train/ce_loss": 1.9384610652923584 }, { "epoch": 0.10401423769033023, "step": 1052, "train/sim_loss": 0.078125 }, { "epoch": 0.10401423769033023, "step": 1052, "train/total_loss": 0.27197110652923584 }, { "entropy": 9.560361862182617, "epoch": 0.10411311053984576, "mean_token_accuracy": 0.7485380172729492, "num_tokens": 5796349.0, "step": 1053, "train/ce_loss": 0.6756236553192139 }, { "epoch": 0.10411311053984576, "step": 1053, "train/sim_loss": 0.02734375 }, { "epoch": 0.10411311053984576, "step": 1053, "train/total_loss": 0.09490611404180527 }, { "entropy": 9.19154167175293, "epoch": 0.10421198338936129, "mean_token_accuracy": 0.7844374179840088, "num_tokens": 5801952.0, "step": 1054, "train/ce_loss": 0.6377497315406799 }, { "epoch": 0.10421198338936129, "step": 1054, "train/sim_loss": 0.0390625 }, { "epoch": 0.10421198338936129, "step": 1054, "train/total_loss": 0.102837473154068 }, { "entropy": 9.257328987121582, "epoch": 0.1043108562388768, "mean_token_accuracy": 0.7297297120094299, "num_tokens": 5807434.0, "step": 1055, "train/ce_loss": 1.5918647050857544 }, { "epoch": 0.1043108562388768, "step": 1055, "train/sim_loss": 0.0546875 }, { "epoch": 0.1043108562388768, "step": 1055, "train/total_loss": 0.2138739675283432 }, { "entropy": 9.622333526611328, "epoch": 0.10440972908839233, "mean_token_accuracy": 0.7754237055778503, "num_tokens": 5812722.0, "step": 1056, "train/ce_loss": 0.6103330850601196 }, { "epoch": 0.10440972908839233, "step": 1056, "train/sim_loss": 0.0703125 }, { "epoch": 0.10440972908839233, "step": 1056, "train/total_loss": 0.13134580850601196 }, { "entropy": 9.086999893188477, "epoch": 0.10450860193790786, "mean_token_accuracy": 0.7412935495376587, "num_tokens": 5818369.0, "step": 1057, "train/ce_loss": 0.6644096374511719 }, { "epoch": 0.10450860193790786, "step": 1057, "train/sim_loss": 0.140625 }, { "epoch": 0.10450860193790786, "step": 1057, "train/total_loss": 0.20706596970558167 }, { "entropy": 9.12022590637207, "epoch": 0.10460747478742337, "mean_token_accuracy": 0.7340748310089111, "num_tokens": 5823948.0, "step": 1058, "train/ce_loss": 0.5529248118400574 }, { "epoch": 0.10460747478742337, "step": 1058, "train/sim_loss": 0.05078125 }, { "epoch": 0.10460747478742337, "step": 1058, "train/total_loss": 0.10607373714447021 }, { "entropy": 8.9729585647583, "epoch": 0.1047063476369389, "mean_token_accuracy": 0.7264770269393921, "num_tokens": 5829522.0, "step": 1059, "train/ce_loss": 0.829442024230957 }, { "epoch": 0.1047063476369389, "step": 1059, "train/sim_loss": 0.125 }, { "epoch": 0.1047063476369389, "step": 1059, "train/total_loss": 0.20794421434402466 }, { "epoch": 0.10480522048645442, "grad_norm": 0.96333909034729, "learning_rate": 9.740641843445583e-06, "loss": 0.1771, "step": 1060 }, { "entropy": 8.889694213867188, "epoch": 0.10480522048645442, "mean_token_accuracy": 0.7667253613471985, "num_tokens": 5835240.0, "step": 1060, "train/ce_loss": 0.426225483417511 }, { "epoch": 0.10480522048645442, "step": 1060, "train/sim_loss": 0.0390625 }, { "epoch": 0.10480522048645442, "step": 1060, "train/total_loss": 0.08168505132198334 }, { "entropy": 9.254831314086914, "epoch": 0.10490409333596994, "mean_token_accuracy": 0.7552083134651184, "num_tokens": 5840525.0, "step": 1061, "train/ce_loss": 1.0859509706497192 }, { "epoch": 0.10490409333596994, "step": 1061, "train/sim_loss": 0.09375 }, { "epoch": 0.10490409333596994, "step": 1061, "train/total_loss": 0.2023451030254364 }, { "entropy": 9.420211791992188, "epoch": 0.10500296618548546, "mean_token_accuracy": 0.7449101805686951, "num_tokens": 5845990.0, "step": 1062, "train/ce_loss": 0.4255778193473816 }, { "epoch": 0.10500296618548546, "step": 1062, "train/sim_loss": 0.078125 }, { "epoch": 0.10500296618548546, "step": 1062, "train/total_loss": 0.12068278342485428 }, { "entropy": 9.340456008911133, "epoch": 0.10510183903500099, "mean_token_accuracy": 0.7302857041358948, "num_tokens": 5851453.0, "step": 1063, "train/ce_loss": 0.718422532081604 }, { "epoch": 0.10510183903500099, "step": 1063, "train/sim_loss": 0.0859375 }, { "epoch": 0.10510183903500099, "step": 1063, "train/total_loss": 0.1577797532081604 }, { "entropy": 8.53613567352295, "epoch": 0.10520071188451652, "mean_token_accuracy": 0.7377049326896667, "num_tokens": 5857331.0, "step": 1064, "train/ce_loss": 0.999879002571106 }, { "epoch": 0.10520071188451652, "step": 1064, "train/sim_loss": 0.09375 }, { "epoch": 0.10520071188451652, "step": 1064, "train/total_loss": 0.19373789429664612 }, { "entropy": 9.239786148071289, "epoch": 0.10529958473403203, "mean_token_accuracy": 0.727911651134491, "num_tokens": 5862819.0, "step": 1065, "train/ce_loss": 1.9395544528961182 }, { "epoch": 0.10529958473403203, "step": 1065, "train/sim_loss": 0.08203125 }, { "epoch": 0.10529958473403203, "step": 1065, "train/total_loss": 0.2759867012500763 }, { "entropy": 9.407793045043945, "epoch": 0.10539845758354756, "mean_token_accuracy": 0.7354925870895386, "num_tokens": 5868159.0, "step": 1066, "train/ce_loss": 0.9665265679359436 }, { "epoch": 0.10539845758354756, "step": 1066, "train/sim_loss": 0.109375 }, { "epoch": 0.10539845758354756, "step": 1066, "train/total_loss": 0.20602765679359436 }, { "entropy": 9.278482437133789, "epoch": 0.10549733043306309, "mean_token_accuracy": 0.7658157348632812, "num_tokens": 5873664.0, "step": 1067, "train/ce_loss": 1.19568932056427 }, { "epoch": 0.10549733043306309, "step": 1067, "train/sim_loss": 0.09375 }, { "epoch": 0.10549733043306309, "step": 1067, "train/total_loss": 0.21331894397735596 }, { "entropy": 9.429168701171875, "epoch": 0.1055962032825786, "mean_token_accuracy": 0.7160493731498718, "num_tokens": 5879073.0, "step": 1068, "train/ce_loss": 0.7553990483283997 }, { "epoch": 0.1055962032825786, "step": 1068, "train/sim_loss": 0.06640625 }, { "epoch": 0.1055962032825786, "step": 1068, "train/total_loss": 0.14194616675376892 }, { "entropy": 9.360651969909668, "epoch": 0.10569507613209413, "mean_token_accuracy": 0.7348203063011169, "num_tokens": 5884462.0, "step": 1069, "train/ce_loss": 0.47743356227874756 }, { "epoch": 0.10569507613209413, "step": 1069, "train/sim_loss": 0.12109375 }, { "epoch": 0.10569507613209413, "step": 1069, "train/total_loss": 0.16883710026741028 }, { "entropy": 9.173002243041992, "epoch": 0.10579394898160965, "mean_token_accuracy": 0.6958580017089844, "num_tokens": 5889919.0, "step": 1070, "train/ce_loss": 0.9101439714431763 }, { "epoch": 0.10579394898160965, "step": 1070, "train/sim_loss": 0.16796875 }, { "epoch": 0.10579394898160965, "step": 1070, "train/total_loss": 0.25898313522338867 }, { "entropy": 9.034067153930664, "epoch": 0.10589282183112517, "mean_token_accuracy": 0.7034631967544556, "num_tokens": 5895494.0, "step": 1071, "train/ce_loss": 0.7976637482643127 }, { "epoch": 0.10589282183112517, "step": 1071, "train/sim_loss": 0.0859375 }, { "epoch": 0.10589282183112517, "step": 1071, "train/total_loss": 0.1657038778066635 }, { "entropy": 9.316985130310059, "epoch": 0.1059916946806407, "mean_token_accuracy": 0.7039473652839661, "num_tokens": 5900930.0, "step": 1072, "train/ce_loss": 1.1319177150726318 }, { "epoch": 0.1059916946806407, "step": 1072, "train/sim_loss": 0.07421875 }, { "epoch": 0.1059916946806407, "step": 1072, "train/total_loss": 0.18741053342819214 }, { "entropy": 9.129426956176758, "epoch": 0.10609056753015622, "mean_token_accuracy": 0.7817969918251038, "num_tokens": 5906392.0, "step": 1073, "train/ce_loss": 0.4803432524204254 }, { "epoch": 0.10609056753015622, "step": 1073, "train/sim_loss": 0.05078125 }, { "epoch": 0.10609056753015622, "step": 1073, "train/total_loss": 0.09881557524204254 }, { "entropy": 9.5374755859375, "epoch": 0.10618944037967175, "mean_token_accuracy": 0.7191011309623718, "num_tokens": 5911629.0, "step": 1074, "train/ce_loss": 0.9348711371421814 }, { "epoch": 0.10618944037967175, "step": 1074, "train/sim_loss": 0.171875 }, { "epoch": 0.10618944037967175, "step": 1074, "train/total_loss": 0.26536211371421814 }, { "entropy": 9.334661483764648, "epoch": 0.10628831322918726, "mean_token_accuracy": 0.742996335029602, "num_tokens": 5917116.0, "step": 1075, "train/ce_loss": 0.49299538135528564 }, { "epoch": 0.10628831322918726, "step": 1075, "train/sim_loss": 0.08984375 }, { "epoch": 0.10628831322918726, "step": 1075, "train/total_loss": 0.13914328813552856 }, { "entropy": 9.742683410644531, "epoch": 0.10638718607870279, "mean_token_accuracy": 0.7701863646507263, "num_tokens": 5922317.0, "step": 1076, "train/ce_loss": 0.9030827879905701 }, { "epoch": 0.10638718607870279, "step": 1076, "train/sim_loss": 0.0703125 }, { "epoch": 0.10638718607870279, "step": 1076, "train/total_loss": 0.160620778799057 }, { "entropy": 9.388204574584961, "epoch": 0.10648605892821832, "mean_token_accuracy": 0.7261306643486023, "num_tokens": 5927736.0, "step": 1077, "train/ce_loss": 1.402625560760498 }, { "epoch": 0.10648605892821832, "step": 1077, "train/sim_loss": 0.11328125 }, { "epoch": 0.10648605892821832, "step": 1077, "train/total_loss": 0.25354379415512085 }, { "entropy": 9.026885986328125, "epoch": 0.10658493177773383, "mean_token_accuracy": 0.7488788962364197, "num_tokens": 5933439.0, "step": 1078, "train/ce_loss": 1.8214640617370605 }, { "epoch": 0.10658493177773383, "step": 1078, "train/sim_loss": 0.0703125 }, { "epoch": 0.10658493177773383, "step": 1078, "train/total_loss": 0.25245893001556396 }, { "entropy": 9.161922454833984, "epoch": 0.10668380462724936, "mean_token_accuracy": 0.772870659828186, "num_tokens": 5938933.0, "step": 1079, "train/ce_loss": 0.36313918232917786 }, { "epoch": 0.10668380462724936, "step": 1079, "train/sim_loss": 0.0625 }, { "epoch": 0.10668380462724936, "step": 1079, "train/total_loss": 0.09881392121315002 }, { "epoch": 0.10678267747676488, "grad_norm": 0.7525379061698914, "learning_rate": 9.735696978687634e-06, "loss": 0.1644, "step": 1080 }, { "entropy": 9.289451599121094, "epoch": 0.10678267747676488, "mean_token_accuracy": 0.7777777910232544, "num_tokens": 5944271.0, "step": 1080, "train/ce_loss": 0.40398943424224854 }, { "epoch": 0.10678267747676488, "step": 1080, "train/sim_loss": 0.05859375 }, { "epoch": 0.10678267747676488, "step": 1080, "train/total_loss": 0.09899269044399261 }, { "entropy": 9.581172943115234, "epoch": 0.1068815503262804, "mean_token_accuracy": 0.7474600672721863, "num_tokens": 5949596.0, "step": 1081, "train/ce_loss": 0.8552248477935791 }, { "epoch": 0.1068815503262804, "step": 1081, "train/sim_loss": 0.09765625 }, { "epoch": 0.1068815503262804, "step": 1081, "train/total_loss": 0.18317873775959015 }, { "entropy": 9.294159889221191, "epoch": 0.10698042317579592, "mean_token_accuracy": 0.7892376780509949, "num_tokens": 5955065.0, "step": 1082, "train/ce_loss": 0.8220403790473938 }, { "epoch": 0.10698042317579592, "step": 1082, "train/sim_loss": 0.0859375 }, { "epoch": 0.10698042317579592, "step": 1082, "train/total_loss": 0.16814154386520386 }, { "entropy": 9.194902420043945, "epoch": 0.10707929602531145, "mean_token_accuracy": 0.6948052048683167, "num_tokens": 5960631.0, "step": 1083, "train/ce_loss": 1.1920182704925537 }, { "epoch": 0.10707929602531145, "step": 1083, "train/sim_loss": 0.1015625 }, { "epoch": 0.10707929602531145, "step": 1083, "train/total_loss": 0.22076433897018433 }, { "entropy": 9.172277450561523, "epoch": 0.10717816887482698, "mean_token_accuracy": 0.7625979781150818, "num_tokens": 5966158.0, "step": 1084, "train/ce_loss": 0.6881358623504639 }, { "epoch": 0.10717816887482698, "step": 1084, "train/sim_loss": 0.07421875 }, { "epoch": 0.10717816887482698, "step": 1084, "train/total_loss": 0.14303234219551086 }, { "entropy": 9.436239242553711, "epoch": 0.10727704172434249, "mean_token_accuracy": 0.7461077570915222, "num_tokens": 5971571.0, "step": 1085, "train/ce_loss": 1.151771068572998 }, { "epoch": 0.10727704172434249, "step": 1085, "train/sim_loss": 0.09375 }, { "epoch": 0.10727704172434249, "step": 1085, "train/total_loss": 0.20892710983753204 }, { "entropy": 9.089550018310547, "epoch": 0.10737591457385802, "mean_token_accuracy": 0.698888897895813, "num_tokens": 5977088.0, "step": 1086, "train/ce_loss": 0.9254410266876221 }, { "epoch": 0.10737591457385802, "step": 1086, "train/sim_loss": 0.0859375 }, { "epoch": 0.10737591457385802, "step": 1086, "train/total_loss": 0.17848160862922668 }, { "entropy": 9.459351539611816, "epoch": 0.10747478742337355, "mean_token_accuracy": 0.7471410632133484, "num_tokens": 5982450.0, "step": 1087, "train/ce_loss": 0.6312832832336426 }, { "epoch": 0.10747478742337355, "step": 1087, "train/sim_loss": 0.0546875 }, { "epoch": 0.10747478742337355, "step": 1087, "train/total_loss": 0.11781582981348038 }, { "entropy": 9.311142921447754, "epoch": 0.10757366027288906, "mean_token_accuracy": 0.7932692170143127, "num_tokens": 5987897.0, "step": 1088, "train/ce_loss": 0.7982792258262634 }, { "epoch": 0.10757366027288906, "step": 1088, "train/sim_loss": 0.0859375 }, { "epoch": 0.10757366027288906, "step": 1088, "train/total_loss": 0.1657654345035553 }, { "entropy": 9.227863311767578, "epoch": 0.10767253312240459, "mean_token_accuracy": 0.7355889678001404, "num_tokens": 5993340.0, "step": 1089, "train/ce_loss": 0.7457457184791565 }, { "epoch": 0.10767253312240459, "step": 1089, "train/sim_loss": 0.05859375 }, { "epoch": 0.10767253312240459, "step": 1089, "train/total_loss": 0.1331683248281479 }, { "entropy": 9.460197448730469, "epoch": 0.10777140597192011, "mean_token_accuracy": 0.7358490824699402, "num_tokens": 5998629.0, "step": 1090, "train/ce_loss": 0.5751650929450989 }, { "epoch": 0.10777140597192011, "step": 1090, "train/sim_loss": 0.0625 }, { "epoch": 0.10777140597192011, "step": 1090, "train/total_loss": 0.12001651525497437 }, { "entropy": 9.432146072387695, "epoch": 0.10787027882143563, "mean_token_accuracy": 0.7858099341392517, "num_tokens": 6003947.0, "step": 1091, "train/ce_loss": 0.7322900295257568 }, { "epoch": 0.10787027882143563, "step": 1091, "train/sim_loss": 0.078125 }, { "epoch": 0.10787027882143563, "step": 1091, "train/total_loss": 0.15135401487350464 }, { "entropy": 9.36902141571045, "epoch": 0.10796915167095116, "mean_token_accuracy": 0.7676630616188049, "num_tokens": 6009370.0, "step": 1092, "train/ce_loss": 0.7428185939788818 }, { "epoch": 0.10796915167095116, "step": 1092, "train/sim_loss": 0.10546875 }, { "epoch": 0.10796915167095116, "step": 1092, "train/total_loss": 0.17975062131881714 }, { "entropy": 9.355979919433594, "epoch": 0.10806802452046668, "mean_token_accuracy": 0.7134831547737122, "num_tokens": 6014860.0, "step": 1093, "train/ce_loss": 1.0389063358306885 }, { "epoch": 0.10806802452046668, "step": 1093, "train/sim_loss": 0.1171875 }, { "epoch": 0.10806802452046668, "step": 1093, "train/total_loss": 0.22107812762260437 }, { "entropy": 9.416781425476074, "epoch": 0.10816689736998221, "mean_token_accuracy": 0.7399723529815674, "num_tokens": 6020246.0, "step": 1094, "train/ce_loss": 0.9212457537651062 }, { "epoch": 0.10816689736998221, "step": 1094, "train/sim_loss": 0.0859375 }, { "epoch": 0.10816689736998221, "step": 1094, "train/total_loss": 0.1780620813369751 }, { "entropy": 9.524955749511719, "epoch": 0.10826577021949772, "mean_token_accuracy": 0.7089743614196777, "num_tokens": 6025657.0, "step": 1095, "train/ce_loss": 0.491433322429657 }, { "epoch": 0.10826577021949772, "step": 1095, "train/sim_loss": 0.0546875 }, { "epoch": 0.10826577021949772, "step": 1095, "train/total_loss": 0.10383082926273346 }, { "entropy": 9.133634567260742, "epoch": 0.10836464306901325, "mean_token_accuracy": 0.7191011309623718, "num_tokens": 6031120.0, "step": 1096, "train/ce_loss": 0.5859624743461609 }, { "epoch": 0.10836464306901325, "step": 1096, "train/sim_loss": 0.109375 }, { "epoch": 0.10836464306901325, "step": 1096, "train/total_loss": 0.16797125339508057 }, { "entropy": 9.234538078308105, "epoch": 0.10846351591852878, "mean_token_accuracy": 0.7623873949050903, "num_tokens": 6036563.0, "step": 1097, "train/ce_loss": 0.5527185797691345 }, { "epoch": 0.10846351591852878, "step": 1097, "train/sim_loss": 0.0859375 }, { "epoch": 0.10846351591852878, "step": 1097, "train/total_loss": 0.14120936393737793 }, { "entropy": 9.376171112060547, "epoch": 0.10856238876804429, "mean_token_accuracy": 0.8039673566818237, "num_tokens": 6041957.0, "step": 1098, "train/ce_loss": 0.6557193994522095 }, { "epoch": 0.10856238876804429, "step": 1098, "train/sim_loss": 0.0390625 }, { "epoch": 0.10856238876804429, "step": 1098, "train/total_loss": 0.10463444143533707 }, { "entropy": 8.981746673583984, "epoch": 0.10866126161755982, "mean_token_accuracy": 0.6791120171546936, "num_tokens": 6047532.0, "step": 1099, "train/ce_loss": 0.9594023823738098 }, { "epoch": 0.10866126161755982, "step": 1099, "train/sim_loss": 0.08984375 }, { "epoch": 0.10866126161755982, "step": 1099, "train/total_loss": 0.1857839822769165 }, { "epoch": 0.10876013446707535, "grad_norm": 1.0593851804733276, "learning_rate": 9.730752113929684e-06, "loss": 0.1652, "step": 1100 }, { "entropy": 9.122255325317383, "epoch": 0.10876013446707535, "mean_token_accuracy": 0.7008055448532104, "num_tokens": 6053252.0, "step": 1100, "train/ce_loss": 0.7138405442237854 }, { "epoch": 0.10876013446707535, "step": 1100, "train/sim_loss": 0.125 }, { "epoch": 0.10876013446707535, "step": 1100, "train/total_loss": 0.19638405740261078 }, { "entropy": 9.039870262145996, "epoch": 0.10885900731659086, "mean_token_accuracy": 0.732801616191864, "num_tokens": 6058873.0, "step": 1101, "train/ce_loss": 1.4543665647506714 }, { "epoch": 0.10885900731659086, "step": 1101, "train/sim_loss": 0.140625 }, { "epoch": 0.10885900731659086, "step": 1101, "train/total_loss": 0.2860616445541382 }, { "entropy": 9.29890251159668, "epoch": 0.10895788016610639, "mean_token_accuracy": 0.7280799150466919, "num_tokens": 6064358.0, "step": 1102, "train/ce_loss": 0.38181832432746887 }, { "epoch": 0.10895788016610639, "step": 1102, "train/sim_loss": 0.08984375 }, { "epoch": 0.10895788016610639, "step": 1102, "train/total_loss": 0.1280255913734436 }, { "entropy": 8.807498931884766, "epoch": 0.10905675301562191, "mean_token_accuracy": 0.7468776106834412, "num_tokens": 6070125.0, "step": 1103, "train/ce_loss": 1.3944907188415527 }, { "epoch": 0.10905675301562191, "step": 1103, "train/sim_loss": 0.10546875 }, { "epoch": 0.10905675301562191, "step": 1103, "train/total_loss": 0.2449178248643875 }, { "entropy": 9.366559982299805, "epoch": 0.10915562586513744, "mean_token_accuracy": 0.7283163070678711, "num_tokens": 6075537.0, "step": 1104, "train/ce_loss": 0.710670530796051 }, { "epoch": 0.10915562586513744, "step": 1104, "train/sim_loss": 0.06640625 }, { "epoch": 0.10915562586513744, "step": 1104, "train/total_loss": 0.13747331500053406 }, { "entropy": 9.425816535949707, "epoch": 0.10925449871465295, "mean_token_accuracy": 0.7680209875106812, "num_tokens": 6080923.0, "step": 1105, "train/ce_loss": 0.6122625470161438 }, { "epoch": 0.10925449871465295, "step": 1105, "train/sim_loss": 0.08984375 }, { "epoch": 0.10925449871465295, "step": 1105, "train/total_loss": 0.1510699987411499 }, { "entropy": 9.047220230102539, "epoch": 0.10935337156416848, "mean_token_accuracy": 0.7931707501411438, "num_tokens": 6086599.0, "step": 1106, "train/ce_loss": 0.5994169116020203 }, { "epoch": 0.10935337156416848, "step": 1106, "train/sim_loss": 0.0625 }, { "epoch": 0.10935337156416848, "step": 1106, "train/total_loss": 0.12244169414043427 }, { "entropy": 8.871713638305664, "epoch": 0.10945224441368401, "mean_token_accuracy": 0.7260034680366516, "num_tokens": 6092337.0, "step": 1107, "train/ce_loss": 0.6721354126930237 }, { "epoch": 0.10945224441368401, "step": 1107, "train/sim_loss": 0.10546875 }, { "epoch": 0.10945224441368401, "step": 1107, "train/total_loss": 0.1726822853088379 }, { "entropy": 9.317713737487793, "epoch": 0.10955111726319952, "mean_token_accuracy": 0.7392739057540894, "num_tokens": 6097809.0, "step": 1108, "train/ce_loss": 0.9451110363006592 }, { "epoch": 0.10955111726319952, "step": 1108, "train/sim_loss": 0.1328125 }, { "epoch": 0.10955111726319952, "step": 1108, "train/total_loss": 0.22732360661029816 }, { "entropy": 9.228523254394531, "epoch": 0.10964999011271505, "mean_token_accuracy": 0.7737069129943848, "num_tokens": 6103354.0, "step": 1109, "train/ce_loss": 0.6781623959541321 }, { "epoch": 0.10964999011271505, "step": 1109, "train/sim_loss": 0.09375 }, { "epoch": 0.10964999011271505, "step": 1109, "train/total_loss": 0.16156624257564545 }, { "entropy": 9.06474494934082, "epoch": 0.10974886296223058, "mean_token_accuracy": 0.7952840328216553, "num_tokens": 6108954.0, "step": 1110, "train/ce_loss": 0.5285145044326782 }, { "epoch": 0.10974886296223058, "step": 1110, "train/sim_loss": 0.06640625 }, { "epoch": 0.10974886296223058, "step": 1110, "train/total_loss": 0.11925770342350006 }, { "entropy": 9.037266731262207, "epoch": 0.10984773581174609, "mean_token_accuracy": 0.7430340647697449, "num_tokens": 6114593.0, "step": 1111, "train/ce_loss": 0.9984353184700012 }, { "epoch": 0.10984773581174609, "step": 1111, "train/sim_loss": 0.1328125 }, { "epoch": 0.10984773581174609, "step": 1111, "train/total_loss": 0.23265603184700012 }, { "entropy": 9.363048553466797, "epoch": 0.10994660866126162, "mean_token_accuracy": 0.7570469975471497, "num_tokens": 6120034.0, "step": 1112, "train/ce_loss": 1.090525507926941 }, { "epoch": 0.10994660866126162, "step": 1112, "train/sim_loss": 0.1171875 }, { "epoch": 0.10994660866126162, "step": 1112, "train/total_loss": 0.22624005377292633 }, { "entropy": 9.561092376708984, "epoch": 0.11004548151077714, "mean_token_accuracy": 0.6841317415237427, "num_tokens": 6125313.0, "step": 1113, "train/ce_loss": 0.6398656964302063 }, { "epoch": 0.11004548151077714, "step": 1113, "train/sim_loss": 0.109375 }, { "epoch": 0.11004548151077714, "step": 1113, "train/total_loss": 0.17336156964302063 }, { "entropy": 9.275972366333008, "epoch": 0.11014435436029267, "mean_token_accuracy": 0.7843137383460999, "num_tokens": 6130816.0, "step": 1114, "train/ce_loss": 0.5259917378425598 }, { "epoch": 0.11014435436029267, "step": 1114, "train/sim_loss": 0.125 }, { "epoch": 0.11014435436029267, "step": 1114, "train/total_loss": 0.17759917676448822 }, { "entropy": 9.363109588623047, "epoch": 0.11024322720980818, "mean_token_accuracy": 0.7377049326896667, "num_tokens": 6136242.0, "step": 1115, "train/ce_loss": 0.8570628762245178 }, { "epoch": 0.11024322720980818, "step": 1115, "train/sim_loss": 0.078125 }, { "epoch": 0.11024322720980818, "step": 1115, "train/total_loss": 0.16383129358291626 }, { "entropy": 9.49985122680664, "epoch": 0.11034210005932371, "mean_token_accuracy": 0.6914893388748169, "num_tokens": 6141485.0, "step": 1116, "train/ce_loss": 1.1978973150253296 }, { "epoch": 0.11034210005932371, "step": 1116, "train/sim_loss": 0.05078125 }, { "epoch": 0.11034210005932371, "step": 1116, "train/total_loss": 0.1705709844827652 }, { "entropy": 9.526140213012695, "epoch": 0.11044097290883924, "mean_token_accuracy": 0.748641312122345, "num_tokens": 6146805.0, "step": 1117, "train/ce_loss": 0.8492251634597778 }, { "epoch": 0.11044097290883924, "step": 1117, "train/sim_loss": 0.12109375 }, { "epoch": 0.11044097290883924, "step": 1117, "train/total_loss": 0.20601627230644226 }, { "entropy": 8.907689094543457, "epoch": 0.11053984575835475, "mean_token_accuracy": 0.7300509214401245, "num_tokens": 6152811.0, "step": 1118, "train/ce_loss": 1.1296863555908203 }, { "epoch": 0.11053984575835475, "step": 1118, "train/sim_loss": 0.0859375 }, { "epoch": 0.11053984575835475, "step": 1118, "train/total_loss": 0.19890613853931427 }, { "entropy": 8.9551420211792, "epoch": 0.11063871860787028, "mean_token_accuracy": 0.6739864945411682, "num_tokens": 6158537.0, "step": 1119, "train/ce_loss": 1.7867990732192993 }, { "epoch": 0.11063871860787028, "step": 1119, "train/sim_loss": 0.0703125 }, { "epoch": 0.11063871860787028, "step": 1119, "train/total_loss": 0.2489924132823944 }, { "epoch": 0.1107375914573858, "grad_norm": 0.9946242570877075, "learning_rate": 9.725807249171736e-06, "loss": 0.1703, "step": 1120 }, { "entropy": 9.184356689453125, "epoch": 0.1107375914573858, "mean_token_accuracy": 0.7058165669441223, "num_tokens": 6164071.0, "step": 1120, "train/ce_loss": 1.0152487754821777 }, { "epoch": 0.1107375914573858, "step": 1120, "train/sim_loss": 0.16015625 }, { "epoch": 0.1107375914573858, "step": 1120, "train/total_loss": 0.26168113946914673 }, { "entropy": 9.397581100463867, "epoch": 0.11083646430690132, "mean_token_accuracy": 0.8104325532913208, "num_tokens": 6169664.0, "step": 1121, "train/ce_loss": 0.5358200669288635 }, { "epoch": 0.11083646430690132, "step": 1121, "train/sim_loss": 0.09375 }, { "epoch": 0.11083646430690132, "step": 1121, "train/total_loss": 0.14733201265335083 }, { "entropy": 9.098569869995117, "epoch": 0.11093533715641685, "mean_token_accuracy": 0.7453658580780029, "num_tokens": 6175483.0, "step": 1122, "train/ce_loss": 0.777290403842926 }, { "epoch": 0.11093533715641685, "step": 1122, "train/sim_loss": 0.1484375 }, { "epoch": 0.11093533715641685, "step": 1122, "train/total_loss": 0.22616654634475708 }, { "entropy": 9.411334037780762, "epoch": 0.11103421000593237, "mean_token_accuracy": 0.7405602931976318, "num_tokens": 6180941.0, "step": 1123, "train/ce_loss": 1.02171790599823 }, { "epoch": 0.11103421000593237, "step": 1123, "train/sim_loss": 0.0859375 }, { "epoch": 0.11103421000593237, "step": 1123, "train/total_loss": 0.18810929358005524 }, { "entropy": 9.181825637817383, "epoch": 0.11113308285544789, "mean_token_accuracy": 0.7829457521438599, "num_tokens": 6186251.0, "step": 1124, "train/ce_loss": 0.5443146824836731 }, { "epoch": 0.11113308285544789, "step": 1124, "train/sim_loss": 0.0546875 }, { "epoch": 0.11113308285544789, "step": 1124, "train/total_loss": 0.10911896824836731 }, { "entropy": 9.572493553161621, "epoch": 0.11123195570496341, "mean_token_accuracy": 0.7582562565803528, "num_tokens": 6191577.0, "step": 1125, "train/ce_loss": 0.7283871173858643 }, { "epoch": 0.11123195570496341, "step": 1125, "train/sim_loss": 0.05078125 }, { "epoch": 0.11123195570496341, "step": 1125, "train/total_loss": 0.12361996620893478 }, { "entropy": 9.130411148071289, "epoch": 0.11133082855447894, "mean_token_accuracy": 0.7202185988426208, "num_tokens": 6197139.0, "step": 1126, "train/ce_loss": 0.7114206552505493 }, { "epoch": 0.11133082855447894, "step": 1126, "train/sim_loss": 0.109375 }, { "epoch": 0.11133082855447894, "step": 1126, "train/total_loss": 0.1805170774459839 }, { "entropy": 8.979597091674805, "epoch": 0.11142970140399447, "mean_token_accuracy": 0.6722939610481262, "num_tokens": 6202744.0, "step": 1127, "train/ce_loss": 0.8555367588996887 }, { "epoch": 0.11142970140399447, "step": 1127, "train/sim_loss": 0.08984375 }, { "epoch": 0.11142970140399447, "step": 1127, "train/total_loss": 0.17539742588996887 }, { "entropy": 9.394695281982422, "epoch": 0.11152857425350998, "mean_token_accuracy": 0.7343387603759766, "num_tokens": 6208153.0, "step": 1128, "train/ce_loss": 0.6329587697982788 }, { "epoch": 0.11152857425350998, "step": 1128, "train/sim_loss": 0.11328125 }, { "epoch": 0.11152857425350998, "step": 1128, "train/total_loss": 0.1765771210193634 }, { "entropy": 9.151700973510742, "epoch": 0.11162744710302551, "mean_token_accuracy": 0.703544557094574, "num_tokens": 6213742.0, "step": 1129, "train/ce_loss": 1.2386038303375244 }, { "epoch": 0.11162744710302551, "step": 1129, "train/sim_loss": 0.1015625 }, { "epoch": 0.11162744710302551, "step": 1129, "train/total_loss": 0.22542288899421692 }, { "entropy": 9.221321105957031, "epoch": 0.11172631995254104, "mean_token_accuracy": 0.7705521583557129, "num_tokens": 6219220.0, "step": 1130, "train/ce_loss": 0.9198158383369446 }, { "epoch": 0.11172631995254104, "step": 1130, "train/sim_loss": 0.03515625 }, { "epoch": 0.11172631995254104, "step": 1130, "train/total_loss": 0.12713783979415894 }, { "entropy": 8.830556869506836, "epoch": 0.11182519280205655, "mean_token_accuracy": 0.7304127812385559, "num_tokens": 6225193.0, "step": 1131, "train/ce_loss": 0.9341030120849609 }, { "epoch": 0.11182519280205655, "step": 1131, "train/sim_loss": 0.09375 }, { "epoch": 0.11182519280205655, "step": 1131, "train/total_loss": 0.18716031312942505 }, { "entropy": 9.275386810302734, "epoch": 0.11192406565157208, "mean_token_accuracy": 0.7323628067970276, "num_tokens": 6230735.0, "step": 1132, "train/ce_loss": 0.8674541711807251 }, { "epoch": 0.11192406565157208, "step": 1132, "train/sim_loss": 0.09765625 }, { "epoch": 0.11192406565157208, "step": 1132, "train/total_loss": 0.18440166115760803 }, { "entropy": 9.0125732421875, "epoch": 0.1120229385010876, "mean_token_accuracy": 0.7830092310905457, "num_tokens": 6236334.0, "step": 1133, "train/ce_loss": 0.7827396392822266 }, { "epoch": 0.1120229385010876, "step": 1133, "train/sim_loss": 0.0546875 }, { "epoch": 0.1120229385010876, "step": 1133, "train/total_loss": 0.1329614669084549 }, { "entropy": 9.208124160766602, "epoch": 0.11212181135060312, "mean_token_accuracy": 0.7199570536613464, "num_tokens": 6241979.0, "step": 1134, "train/ce_loss": 1.2954823970794678 }, { "epoch": 0.11212181135060312, "step": 1134, "train/sim_loss": 0.109375 }, { "epoch": 0.11212181135060312, "step": 1134, "train/total_loss": 0.23892323672771454 }, { "entropy": 9.406225204467773, "epoch": 0.11222068420011864, "mean_token_accuracy": 0.7344300746917725, "num_tokens": 6247312.0, "step": 1135, "train/ce_loss": 0.5257598757743835 }, { "epoch": 0.11222068420011864, "step": 1135, "train/sim_loss": 0.03125 }, { "epoch": 0.11222068420011864, "step": 1135, "train/total_loss": 0.0838259905576706 }, { "entropy": 9.062701225280762, "epoch": 0.11231955704963417, "mean_token_accuracy": 0.7253731489181519, "num_tokens": 6252974.0, "step": 1136, "train/ce_loss": 0.740999698638916 }, { "epoch": 0.11231955704963417, "step": 1136, "train/sim_loss": 0.05078125 }, { "epoch": 0.11231955704963417, "step": 1136, "train/total_loss": 0.12488122284412384 }, { "entropy": 9.3583984375, "epoch": 0.1124184298991497, "mean_token_accuracy": 0.7281213402748108, "num_tokens": 6258474.0, "step": 1137, "train/ce_loss": 1.3554621934890747 }, { "epoch": 0.1124184298991497, "step": 1137, "train/sim_loss": 0.09375 }, { "epoch": 0.1124184298991497, "step": 1137, "train/total_loss": 0.2292962223291397 }, { "entropy": 9.512561798095703, "epoch": 0.11251730274866521, "mean_token_accuracy": 0.7291414737701416, "num_tokens": 6264066.0, "step": 1138, "train/ce_loss": 1.4475321769714355 }, { "epoch": 0.11251730274866521, "step": 1138, "train/sim_loss": 0.0703125 }, { "epoch": 0.11251730274866521, "step": 1138, "train/total_loss": 0.21506571769714355 }, { "entropy": 9.204137802124023, "epoch": 0.11261617559818074, "mean_token_accuracy": 0.7377451062202454, "num_tokens": 6269501.0, "step": 1139, "train/ce_loss": 0.6939266324043274 }, { "epoch": 0.11261617559818074, "step": 1139, "train/sim_loss": 0.1640625 }, { "epoch": 0.11261617559818074, "step": 1139, "train/total_loss": 0.23345516622066498 }, { "epoch": 0.11271504844769627, "grad_norm": 0.9239275455474854, "learning_rate": 9.720862384413787e-06, "loss": 0.1747, "step": 1140 }, { "entropy": 9.35012435913086, "epoch": 0.11271504844769627, "mean_token_accuracy": 0.7707838416099548, "num_tokens": 6274948.0, "step": 1140, "train/ce_loss": 0.6853241920471191 }, { "epoch": 0.11271504844769627, "step": 1140, "train/sim_loss": 0.03125 }, { "epoch": 0.11271504844769627, "step": 1140, "train/total_loss": 0.09978242218494415 }, { "entropy": 9.271203994750977, "epoch": 0.11281392129721178, "mean_token_accuracy": 0.7131336331367493, "num_tokens": 6280352.0, "step": 1141, "train/ce_loss": 1.3459912538528442 }, { "epoch": 0.11281392129721178, "step": 1141, "train/sim_loss": 0.09765625 }, { "epoch": 0.11281392129721178, "step": 1141, "train/total_loss": 0.23225538432598114 }, { "entropy": 9.550921440124512, "epoch": 0.11291279414672731, "mean_token_accuracy": 0.7516960501670837, "num_tokens": 6285694.0, "step": 1142, "train/ce_loss": 0.7221745848655701 }, { "epoch": 0.11291279414672731, "step": 1142, "train/sim_loss": 0.09765625 }, { "epoch": 0.11291279414672731, "step": 1142, "train/total_loss": 0.16987371444702148 }, { "entropy": 9.234350204467773, "epoch": 0.11301166699624283, "mean_token_accuracy": 0.6993789076805115, "num_tokens": 6291086.0, "step": 1143, "train/ce_loss": 1.13334321975708 }, { "epoch": 0.11301166699624283, "step": 1143, "train/sim_loss": 0.140625 }, { "epoch": 0.11301166699624283, "step": 1143, "train/total_loss": 0.2539593279361725 }, { "entropy": 8.804132461547852, "epoch": 0.11311053984575835, "mean_token_accuracy": 0.7387068271636963, "num_tokens": 6296903.0, "step": 1144, "train/ce_loss": 0.4301456809043884 }, { "epoch": 0.11311053984575835, "step": 1144, "train/sim_loss": 0.078125 }, { "epoch": 0.11311053984575835, "step": 1144, "train/total_loss": 0.12113957107067108 }, { "entropy": 9.367783546447754, "epoch": 0.11320941269527388, "mean_token_accuracy": 0.7459016442298889, "num_tokens": 6302522.0, "step": 1145, "train/ce_loss": 1.1335062980651855 }, { "epoch": 0.11320941269527388, "step": 1145, "train/sim_loss": 0.1015625 }, { "epoch": 0.11320941269527388, "step": 1145, "train/total_loss": 0.21491312980651855 }, { "entropy": 8.845651626586914, "epoch": 0.1133082855447894, "mean_token_accuracy": 0.6948997974395752, "num_tokens": 6308231.0, "step": 1146, "train/ce_loss": 1.9742225408554077 }, { "epoch": 0.1133082855447894, "step": 1146, "train/sim_loss": 0.08203125 }, { "epoch": 0.1133082855447894, "step": 1146, "train/total_loss": 0.2794535160064697 }, { "entropy": 8.96423625946045, "epoch": 0.11340715839430493, "mean_token_accuracy": 0.6906552314758301, "num_tokens": 6313809.0, "step": 1147, "train/ce_loss": 1.1339006423950195 }, { "epoch": 0.11340715839430493, "step": 1147, "train/sim_loss": 0.10546875 }, { "epoch": 0.11340715839430493, "step": 1147, "train/total_loss": 0.21885880827903748 }, { "entropy": 9.091513633728027, "epoch": 0.11350603124382044, "mean_token_accuracy": 0.7472661137580872, "num_tokens": 6319323.0, "step": 1148, "train/ce_loss": 0.5906043648719788 }, { "epoch": 0.11350603124382044, "step": 1148, "train/sim_loss": 0.0859375 }, { "epoch": 0.11350603124382044, "step": 1148, "train/total_loss": 0.14499793946743011 }, { "entropy": 9.122335433959961, "epoch": 0.11360490409333597, "mean_token_accuracy": 0.7751412391662598, "num_tokens": 6324812.0, "step": 1149, "train/ce_loss": 0.6918571591377258 }, { "epoch": 0.11360490409333597, "step": 1149, "train/sim_loss": 0.046875 }, { "epoch": 0.11360490409333597, "step": 1149, "train/total_loss": 0.11606071889400482 }, { "entropy": 9.375894546508789, "epoch": 0.1137037769428515, "mean_token_accuracy": 0.7008547186851501, "num_tokens": 6330155.0, "step": 1150, "train/ce_loss": 0.7957652807235718 }, { "epoch": 0.1137037769428515, "step": 1150, "train/sim_loss": 0.07421875 }, { "epoch": 0.1137037769428515, "step": 1150, "train/total_loss": 0.1537952721118927 }, { "entropy": 8.997232437133789, "epoch": 0.11380264979236701, "mean_token_accuracy": 0.7961722612380981, "num_tokens": 6335826.0, "step": 1151, "train/ce_loss": 0.599818229675293 }, { "epoch": 0.11380264979236701, "step": 1151, "train/sim_loss": 0.078125 }, { "epoch": 0.11380264979236701, "step": 1151, "train/total_loss": 0.1381068229675293 }, { "entropy": 9.224239349365234, "epoch": 0.11390152264188254, "mean_token_accuracy": 0.7751256227493286, "num_tokens": 6341208.0, "step": 1152, "train/ce_loss": 0.7777975797653198 }, { "epoch": 0.11390152264188254, "step": 1152, "train/sim_loss": 0.11328125 }, { "epoch": 0.11390152264188254, "step": 1152, "train/total_loss": 0.19106101989746094 }, { "entropy": 9.26906967163086, "epoch": 0.11400039549139807, "mean_token_accuracy": 0.7825537323951721, "num_tokens": 6346634.0, "step": 1153, "train/ce_loss": 0.8140298128128052 }, { "epoch": 0.11400039549139807, "step": 1153, "train/sim_loss": 0.0625 }, { "epoch": 0.11400039549139807, "step": 1153, "train/total_loss": 0.143902987241745 }, { "entropy": 9.566508293151855, "epoch": 0.11409926834091358, "mean_token_accuracy": 0.7425742745399475, "num_tokens": 6351884.0, "step": 1154, "train/ce_loss": 0.49894341826438904 }, { "epoch": 0.11409926834091358, "step": 1154, "train/sim_loss": 0.0703125 }, { "epoch": 0.11409926834091358, "step": 1154, "train/total_loss": 0.12020684778690338 }, { "entropy": 9.181291580200195, "epoch": 0.1141981411904291, "mean_token_accuracy": 0.7340425252914429, "num_tokens": 6357397.0, "step": 1155, "train/ce_loss": 1.1664700508117676 }, { "epoch": 0.1141981411904291, "step": 1155, "train/sim_loss": 0.12109375 }, { "epoch": 0.1141981411904291, "step": 1155, "train/total_loss": 0.23774075508117676 }, { "entropy": 8.632823944091797, "epoch": 0.11429701403994463, "mean_token_accuracy": 0.758230447769165, "num_tokens": 6363019.0, "step": 1156, "train/ce_loss": 0.6911745667457581 }, { "epoch": 0.11429701403994463, "step": 1156, "train/sim_loss": 0.12109375 }, { "epoch": 0.11429701403994463, "step": 1156, "train/total_loss": 0.1902112066745758 }, { "entropy": 9.24258804321289, "epoch": 0.11439588688946016, "mean_token_accuracy": 0.7368420958518982, "num_tokens": 6368430.0, "step": 1157, "train/ce_loss": 1.4837995767593384 }, { "epoch": 0.11439588688946016, "step": 1157, "train/sim_loss": 0.1015625 }, { "epoch": 0.11439588688946016, "step": 1157, "train/total_loss": 0.24994246661663055 }, { "entropy": 9.011066436767578, "epoch": 0.11449475973897567, "mean_token_accuracy": 0.7651515007019043, "num_tokens": 6373976.0, "step": 1158, "train/ce_loss": 0.718772828578949 }, { "epoch": 0.11449475973897567, "step": 1158, "train/sim_loss": 0.0703125 }, { "epoch": 0.11449475973897567, "step": 1158, "train/total_loss": 0.14218978583812714 }, { "entropy": 9.014022827148438, "epoch": 0.1145936325884912, "mean_token_accuracy": 0.7404426336288452, "num_tokens": 6379618.0, "step": 1159, "train/ce_loss": 0.7627533674240112 }, { "epoch": 0.1145936325884912, "step": 1159, "train/sim_loss": 0.07421875 }, { "epoch": 0.1145936325884912, "step": 1159, "train/total_loss": 0.15049409866333008 }, { "epoch": 0.11469250543800673, "grad_norm": 1.0119487047195435, "learning_rate": 9.715917519655839e-06, "loss": 0.1663, "step": 1160 }, { "entropy": 9.506391525268555, "epoch": 0.11469250543800673, "mean_token_accuracy": 0.7630208134651184, "num_tokens": 6384981.0, "step": 1160, "train/ce_loss": 0.74371737241745 }, { "epoch": 0.11469250543800673, "step": 1160, "train/sim_loss": 0.06640625 }, { "epoch": 0.11469250543800673, "step": 1160, "train/total_loss": 0.14077799022197723 }, { "entropy": 8.781597137451172, "epoch": 0.11479137828752224, "mean_token_accuracy": 0.6546830534934998, "num_tokens": 6390666.0, "step": 1161, "train/ce_loss": 0.4679122567176819 }, { "epoch": 0.11479137828752224, "step": 1161, "train/sim_loss": 0.11328125 }, { "epoch": 0.11479137828752224, "step": 1161, "train/total_loss": 0.1600724756717682 }, { "entropy": 9.189435005187988, "epoch": 0.11489025113703777, "mean_token_accuracy": 0.7759009003639221, "num_tokens": 6396203.0, "step": 1162, "train/ce_loss": 0.674059271812439 }, { "epoch": 0.11489025113703777, "step": 1162, "train/sim_loss": 0.125 }, { "epoch": 0.11489025113703777, "step": 1162, "train/total_loss": 0.19240593910217285 }, { "entropy": 9.348089218139648, "epoch": 0.1149891239865533, "mean_token_accuracy": 0.7124711275100708, "num_tokens": 6401621.0, "step": 1163, "train/ce_loss": 0.7656547427177429 }, { "epoch": 0.1149891239865533, "step": 1163, "train/sim_loss": 0.07421875 }, { "epoch": 0.1149891239865533, "step": 1163, "train/total_loss": 0.1507842242717743 }, { "entropy": 9.384284973144531, "epoch": 0.11508799683606881, "mean_token_accuracy": 0.6993464231491089, "num_tokens": 6406944.0, "step": 1164, "train/ce_loss": 0.6949051022529602 }, { "epoch": 0.11508799683606881, "step": 1164, "train/sim_loss": 0.0390625 }, { "epoch": 0.11508799683606881, "step": 1164, "train/total_loss": 0.10855301469564438 }, { "entropy": 9.502967834472656, "epoch": 0.11518686968558434, "mean_token_accuracy": 0.6993197202682495, "num_tokens": 6412277.0, "step": 1165, "train/ce_loss": 0.836839497089386 }, { "epoch": 0.11518686968558434, "step": 1165, "train/sim_loss": 0.0859375 }, { "epoch": 0.11518686968558434, "step": 1165, "train/total_loss": 0.16962145268917084 }, { "entropy": 9.108278274536133, "epoch": 0.11528574253509986, "mean_token_accuracy": 0.7597700953483582, "num_tokens": 6417809.0, "step": 1166, "train/ce_loss": 1.052413821220398 }, { "epoch": 0.11528574253509986, "step": 1166, "train/sim_loss": 0.1015625 }, { "epoch": 0.11528574253509986, "step": 1166, "train/total_loss": 0.20680388808250427 }, { "entropy": 9.509326934814453, "epoch": 0.11538461538461539, "mean_token_accuracy": 0.7126865386962891, "num_tokens": 6423163.0, "step": 1167, "train/ce_loss": 0.8321999311447144 }, { "epoch": 0.11538461538461539, "step": 1167, "train/sim_loss": 0.0546875 }, { "epoch": 0.11538461538461539, "step": 1167, "train/total_loss": 0.1379075050354004 }, { "entropy": 9.304607391357422, "epoch": 0.1154834882341309, "mean_token_accuracy": 0.7457420825958252, "num_tokens": 6428595.0, "step": 1168, "train/ce_loss": 0.7342486381530762 }, { "epoch": 0.1154834882341309, "step": 1168, "train/sim_loss": 0.0859375 }, { "epoch": 0.1154834882341309, "step": 1168, "train/total_loss": 0.15936237573623657 }, { "entropy": 9.309526443481445, "epoch": 0.11558236108364643, "mean_token_accuracy": 0.7740046977996826, "num_tokens": 6434065.0, "step": 1169, "train/ce_loss": 0.7391464710235596 }, { "epoch": 0.11558236108364643, "step": 1169, "train/sim_loss": 0.08984375 }, { "epoch": 0.11558236108364643, "step": 1169, "train/total_loss": 0.16375839710235596 }, { "entropy": 9.285765647888184, "epoch": 0.11568123393316196, "mean_token_accuracy": 0.7051281929016113, "num_tokens": 6439464.0, "step": 1170, "train/ce_loss": 0.8468372821807861 }, { "epoch": 0.11568123393316196, "step": 1170, "train/sim_loss": 0.0625 }, { "epoch": 0.11568123393316196, "step": 1170, "train/total_loss": 0.14718373119831085 }, { "entropy": 9.310624122619629, "epoch": 0.11578010678267747, "mean_token_accuracy": 0.7749999761581421, "num_tokens": 6444888.0, "step": 1171, "train/ce_loss": 1.047308325767517 }, { "epoch": 0.11578010678267747, "step": 1171, "train/sim_loss": 0.0625 }, { "epoch": 0.11578010678267747, "step": 1171, "train/total_loss": 0.16723084449768066 }, { "entropy": 9.497734069824219, "epoch": 0.115878979632193, "mean_token_accuracy": 0.7480719685554504, "num_tokens": 6450209.0, "step": 1172, "train/ce_loss": 0.7057614326477051 }, { "epoch": 0.115878979632193, "step": 1172, "train/sim_loss": 0.03515625 }, { "epoch": 0.115878979632193, "step": 1172, "train/total_loss": 0.10573239624500275 }, { "entropy": 9.28608512878418, "epoch": 0.11597785248170853, "mean_token_accuracy": 0.7743362784385681, "num_tokens": 6455723.0, "step": 1173, "train/ce_loss": 0.9728030562400818 }, { "epoch": 0.11597785248170853, "step": 1173, "train/sim_loss": 0.15234375 }, { "epoch": 0.11597785248170853, "step": 1173, "train/total_loss": 0.24962405860424042 }, { "entropy": 9.20595645904541, "epoch": 0.11607672533122404, "mean_token_accuracy": 0.7221609950065613, "num_tokens": 6461293.0, "step": 1174, "train/ce_loss": 0.8270496726036072 }, { "epoch": 0.11607672533122404, "step": 1174, "train/sim_loss": 0.125 }, { "epoch": 0.11607672533122404, "step": 1174, "train/total_loss": 0.20770496129989624 }, { "entropy": 9.098197937011719, "epoch": 0.11617559818073957, "mean_token_accuracy": 0.6694825887680054, "num_tokens": 6466805.0, "step": 1175, "train/ce_loss": 1.4707664251327515 }, { "epoch": 0.11617559818073957, "step": 1175, "train/sim_loss": 0.12109375 }, { "epoch": 0.11617559818073957, "step": 1175, "train/total_loss": 0.26817041635513306 }, { "entropy": 9.14116096496582, "epoch": 0.1162744710302551, "mean_token_accuracy": 0.7051962018013, "num_tokens": 6472318.0, "step": 1176, "train/ce_loss": 0.521716296672821 }, { "epoch": 0.1162744710302551, "step": 1176, "train/sim_loss": 0.0390625 }, { "epoch": 0.1162744710302551, "step": 1176, "train/total_loss": 0.09123413264751434 }, { "entropy": 8.934706687927246, "epoch": 0.11637334387977062, "mean_token_accuracy": 0.7661691308021545, "num_tokens": 6477970.0, "step": 1177, "train/ce_loss": 0.9879838824272156 }, { "epoch": 0.11637334387977062, "step": 1177, "train/sim_loss": 0.10546875 }, { "epoch": 0.11637334387977062, "step": 1177, "train/total_loss": 0.20426714420318604 }, { "entropy": 9.403409957885742, "epoch": 0.11647221672928613, "mean_token_accuracy": 0.803170382976532, "num_tokens": 6483356.0, "step": 1178, "train/ce_loss": 0.5339148044586182 }, { "epoch": 0.11647221672928613, "step": 1178, "train/sim_loss": 0.1015625 }, { "epoch": 0.11647221672928613, "step": 1178, "train/total_loss": 0.1549539864063263 }, { "entropy": 9.511190414428711, "epoch": 0.11657108957880166, "mean_token_accuracy": 0.7276536226272583, "num_tokens": 6488674.0, "step": 1179, "train/ce_loss": 0.803650438785553 }, { "epoch": 0.11657108957880166, "step": 1179, "train/sim_loss": 0.0625 }, { "epoch": 0.11657108957880166, "step": 1179, "train/total_loss": 0.14286504685878754 }, { "epoch": 0.11666996242831719, "grad_norm": 1.0589808225631714, "learning_rate": 9.71097265489789e-06, "loss": 0.1659, "step": 1180 }, { "entropy": 9.055192947387695, "epoch": 0.11666996242831719, "mean_token_accuracy": 0.7118181586265564, "num_tokens": 6494358.0, "step": 1180, "train/ce_loss": 0.5763972997665405 }, { "epoch": 0.11666996242831719, "step": 1180, "train/sim_loss": 0.109375 }, { "epoch": 0.11666996242831719, "step": 1180, "train/total_loss": 0.1670147329568863 }, { "entropy": 9.254077911376953, "epoch": 0.1167688352778327, "mean_token_accuracy": 0.7295373678207397, "num_tokens": 6499759.0, "step": 1181, "train/ce_loss": 0.470821350812912 }, { "epoch": 0.1167688352778327, "step": 1181, "train/sim_loss": 0.125 }, { "epoch": 0.1167688352778327, "step": 1181, "train/total_loss": 0.17208214104175568 }, { "entropy": 9.127437591552734, "epoch": 0.11686770812734823, "mean_token_accuracy": 0.7610526084899902, "num_tokens": 6505318.0, "step": 1182, "train/ce_loss": 0.8757054209709167 }, { "epoch": 0.11686770812734823, "step": 1182, "train/sim_loss": 0.046875 }, { "epoch": 0.11686770812734823, "step": 1182, "train/total_loss": 0.13444554805755615 }, { "entropy": 9.33857250213623, "epoch": 0.11696658097686376, "mean_token_accuracy": 0.7261613607406616, "num_tokens": 6510776.0, "step": 1183, "train/ce_loss": 0.8126062750816345 }, { "epoch": 0.11696658097686376, "step": 1183, "train/sim_loss": 0.05078125 }, { "epoch": 0.11696658097686376, "step": 1183, "train/total_loss": 0.13204187154769897 }, { "entropy": 9.18802261352539, "epoch": 0.11706545382637927, "mean_token_accuracy": 0.7268950939178467, "num_tokens": 6516319.0, "step": 1184, "train/ce_loss": 0.7684488892555237 }, { "epoch": 0.11706545382637927, "step": 1184, "train/sim_loss": 0.1484375 }, { "epoch": 0.11706545382637927, "step": 1184, "train/total_loss": 0.22528240084648132 }, { "entropy": 9.248311996459961, "epoch": 0.1171643266758948, "mean_token_accuracy": 0.7292134761810303, "num_tokens": 6521839.0, "step": 1185, "train/ce_loss": 0.5157222151756287 }, { "epoch": 0.1171643266758948, "step": 1185, "train/sim_loss": 0.07421875 }, { "epoch": 0.1171643266758948, "step": 1185, "train/total_loss": 0.12579096853733063 }, { "entropy": 9.330893516540527, "epoch": 0.11726319952541032, "mean_token_accuracy": 0.7657004594802856, "num_tokens": 6527185.0, "step": 1186, "train/ce_loss": 0.9020752906799316 }, { "epoch": 0.11726319952541032, "step": 1186, "train/sim_loss": 0.0703125 }, { "epoch": 0.11726319952541032, "step": 1186, "train/total_loss": 0.1605200320482254 }, { "entropy": 9.295162200927734, "epoch": 0.11736207237492585, "mean_token_accuracy": 0.7796407341957092, "num_tokens": 6532619.0, "step": 1187, "train/ce_loss": 0.8549613952636719 }, { "epoch": 0.11736207237492585, "step": 1187, "train/sim_loss": 0.05078125 }, { "epoch": 0.11736207237492585, "step": 1187, "train/total_loss": 0.13627739250659943 }, { "entropy": 9.27976131439209, "epoch": 0.11746094522444137, "mean_token_accuracy": 0.7219873070716858, "num_tokens": 6538157.0, "step": 1188, "train/ce_loss": 1.7689892053604126 }, { "epoch": 0.11746094522444137, "step": 1188, "train/sim_loss": 0.1484375 }, { "epoch": 0.11746094522444137, "step": 1188, "train/total_loss": 0.32533642649650574 }, { "entropy": 9.259166717529297, "epoch": 0.11755981807395689, "mean_token_accuracy": 0.7144408226013184, "num_tokens": 6543588.0, "step": 1189, "train/ce_loss": 0.6413969397544861 }, { "epoch": 0.11755981807395689, "step": 1189, "train/sim_loss": 0.0546875 }, { "epoch": 0.11755981807395689, "step": 1189, "train/total_loss": 0.11882719397544861 }, { "entropy": 9.518424987792969, "epoch": 0.11765869092347242, "mean_token_accuracy": 0.7607089877128601, "num_tokens": 6548917.0, "step": 1190, "train/ce_loss": 1.3845802545547485 }, { "epoch": 0.11765869092347242, "step": 1190, "train/sim_loss": 0.11328125 }, { "epoch": 0.11765869092347242, "step": 1190, "train/total_loss": 0.2517392635345459 }, { "entropy": 9.3843412399292, "epoch": 0.11775756377298793, "mean_token_accuracy": 0.7027027010917664, "num_tokens": 6554326.0, "step": 1191, "train/ce_loss": 0.8504999876022339 }, { "epoch": 0.11775756377298793, "step": 1191, "train/sim_loss": 0.12890625 }, { "epoch": 0.11775756377298793, "step": 1191, "train/total_loss": 0.21395625174045563 }, { "entropy": 9.142436027526855, "epoch": 0.11785643662250346, "mean_token_accuracy": 0.7598608136177063, "num_tokens": 6559794.0, "step": 1192, "train/ce_loss": 0.7565531134605408 }, { "epoch": 0.11785643662250346, "step": 1192, "train/sim_loss": 0.09765625 }, { "epoch": 0.11785643662250346, "step": 1192, "train/total_loss": 0.17331156134605408 }, { "entropy": 9.008842468261719, "epoch": 0.11795530947201899, "mean_token_accuracy": 0.7053388357162476, "num_tokens": 6565352.0, "step": 1193, "train/ce_loss": 1.1769413948059082 }, { "epoch": 0.11795530947201899, "step": 1193, "train/sim_loss": 0.06640625 }, { "epoch": 0.11795530947201899, "step": 1193, "train/total_loss": 0.18410038948059082 }, { "entropy": 9.020503997802734, "epoch": 0.1180541823215345, "mean_token_accuracy": 0.8260437250137329, "num_tokens": 6570993.0, "step": 1194, "train/ce_loss": 0.6715536117553711 }, { "epoch": 0.1180541823215345, "step": 1194, "train/sim_loss": 0.0859375 }, { "epoch": 0.1180541823215345, "step": 1194, "train/total_loss": 0.1530928611755371 }, { "entropy": 9.140643119812012, "epoch": 0.11815305517105003, "mean_token_accuracy": 0.7901740074157715, "num_tokens": 6576536.0, "step": 1195, "train/ce_loss": 0.8337215185165405 }, { "epoch": 0.11815305517105003, "step": 1195, "train/sim_loss": 0.08203125 }, { "epoch": 0.11815305517105003, "step": 1195, "train/total_loss": 0.16540339589118958 }, { "entropy": 9.246603965759277, "epoch": 0.11825192802056556, "mean_token_accuracy": 0.7370242476463318, "num_tokens": 6581996.0, "step": 1196, "train/ce_loss": 0.8181374669075012 }, { "epoch": 0.11825192802056556, "step": 1196, "train/sim_loss": 0.02734375 }, { "epoch": 0.11825192802056556, "step": 1196, "train/total_loss": 0.109157495200634 }, { "entropy": 8.994644165039062, "epoch": 0.11835080087008108, "mean_token_accuracy": 0.7669094800949097, "num_tokens": 6587601.0, "step": 1197, "train/ce_loss": 0.5358651280403137 }, { "epoch": 0.11835080087008108, "step": 1197, "train/sim_loss": 0.0859375 }, { "epoch": 0.11835080087008108, "step": 1197, "train/total_loss": 0.13952401280403137 }, { "entropy": 9.007833480834961, "epoch": 0.1184496737195966, "mean_token_accuracy": 0.7220602631568909, "num_tokens": 6593228.0, "step": 1198, "train/ce_loss": 1.1440774202346802 }, { "epoch": 0.1184496737195966, "step": 1198, "train/sim_loss": 0.12109375 }, { "epoch": 0.1184496737195966, "step": 1198, "train/total_loss": 0.2355014979839325 }, { "entropy": 9.024446487426758, "epoch": 0.11854854656911212, "mean_token_accuracy": 0.7454545497894287, "num_tokens": 6598814.0, "step": 1199, "train/ce_loss": 0.5733365416526794 }, { "epoch": 0.11854854656911212, "step": 1199, "train/sim_loss": 0.05859375 }, { "epoch": 0.11854854656911212, "step": 1199, "train/total_loss": 0.11592740565538406 }, { "epoch": 0.11864741941862765, "grad_norm": 1.1151913404464722, "learning_rate": 9.70602779013994e-06, "loss": 0.164, "step": 1200 }, { "entropy": 9.104966163635254, "epoch": 0.11864741941862765, "mean_token_accuracy": 0.7766599655151367, "num_tokens": 6604332.0, "step": 1200, "train/ce_loss": 0.3799320161342621 }, { "epoch": 0.11864741941862765, "step": 1200, "train/sim_loss": 0.02734375 }, { "epoch": 0.11864741941862765, "step": 1200, "train/total_loss": 0.06533695757389069 }, { "entropy": 9.173948287963867, "epoch": 0.11874629226814316, "mean_token_accuracy": 0.7616407871246338, "num_tokens": 6609843.0, "step": 1201, "train/ce_loss": 0.8502901196479797 }, { "epoch": 0.11874629226814316, "step": 1201, "train/sim_loss": 0.109375 }, { "epoch": 0.11874629226814316, "step": 1201, "train/total_loss": 0.1944040060043335 }, { "entropy": 9.449905395507812, "epoch": 0.11884516511765869, "mean_token_accuracy": 0.7289473414421082, "num_tokens": 6615204.0, "step": 1202, "train/ce_loss": 0.6444717645645142 }, { "epoch": 0.11884516511765869, "step": 1202, "train/sim_loss": 0.03515625 }, { "epoch": 0.11884516511765869, "step": 1202, "train/total_loss": 0.09960342943668365 }, { "entropy": 9.139521598815918, "epoch": 0.11894403796717422, "mean_token_accuracy": 0.7622448801994324, "num_tokens": 6620817.0, "step": 1203, "train/ce_loss": 0.5059130787849426 }, { "epoch": 0.11894403796717422, "step": 1203, "train/sim_loss": 0.109375 }, { "epoch": 0.11894403796717422, "step": 1203, "train/total_loss": 0.15996630489826202 }, { "entropy": 9.346738815307617, "epoch": 0.11904291081668973, "mean_token_accuracy": 0.761904776096344, "num_tokens": 6626212.0, "step": 1204, "train/ce_loss": 0.6965883374214172 }, { "epoch": 0.11904291081668973, "step": 1204, "train/sim_loss": 0.06640625 }, { "epoch": 0.11904291081668973, "step": 1204, "train/total_loss": 0.13606509566307068 }, { "entropy": 8.796134948730469, "epoch": 0.11914178366620526, "mean_token_accuracy": 0.7249796390533447, "num_tokens": 6632039.0, "step": 1205, "train/ce_loss": 0.4703652858734131 }, { "epoch": 0.11914178366620526, "step": 1205, "train/sim_loss": 0.08984375 }, { "epoch": 0.11914178366620526, "step": 1205, "train/total_loss": 0.1368802785873413 }, { "entropy": 9.062372207641602, "epoch": 0.11924065651572079, "mean_token_accuracy": 0.7338804006576538, "num_tokens": 6637526.0, "step": 1206, "train/ce_loss": 0.7613099217414856 }, { "epoch": 0.11924065651572079, "step": 1206, "train/sim_loss": 0.1015625 }, { "epoch": 0.11924065651572079, "step": 1206, "train/total_loss": 0.17769348621368408 }, { "entropy": 9.408955574035645, "epoch": 0.1193395293652363, "mean_token_accuracy": 0.7485648393630981, "num_tokens": 6642919.0, "step": 1207, "train/ce_loss": 0.6442524194717407 }, { "epoch": 0.1193395293652363, "step": 1207, "train/sim_loss": 0.1015625 }, { "epoch": 0.1193395293652363, "step": 1207, "train/total_loss": 0.1659877449274063 }, { "entropy": 9.0310640335083, "epoch": 0.11943840221475183, "mean_token_accuracy": 0.695966899394989, "num_tokens": 6648423.0, "step": 1208, "train/ce_loss": 0.9464987516403198 }, { "epoch": 0.11943840221475183, "step": 1208, "train/sim_loss": 0.1328125 }, { "epoch": 0.11943840221475183, "step": 1208, "train/total_loss": 0.22746238112449646 }, { "entropy": 9.465675354003906, "epoch": 0.11953727506426735, "mean_token_accuracy": 0.7624161243438721, "num_tokens": 6653781.0, "step": 1209, "train/ce_loss": 0.9205053448677063 }, { "epoch": 0.11953727506426735, "step": 1209, "train/sim_loss": 0.05078125 }, { "epoch": 0.11953727506426735, "step": 1209, "train/total_loss": 0.14283178746700287 }, { "entropy": 8.981742858886719, "epoch": 0.11963614791378288, "mean_token_accuracy": 0.7051546573638916, "num_tokens": 6659344.0, "step": 1210, "train/ce_loss": 1.1661628484725952 }, { "epoch": 0.11963614791378288, "step": 1210, "train/sim_loss": 0.171875 }, { "epoch": 0.11963614791378288, "step": 1210, "train/total_loss": 0.28849127888679504 }, { "entropy": 9.208793640136719, "epoch": 0.1197350207632984, "mean_token_accuracy": 0.7680826783180237, "num_tokens": 6664850.0, "step": 1211, "train/ce_loss": 0.3852444589138031 }, { "epoch": 0.1197350207632984, "step": 1211, "train/sim_loss": 0.05078125 }, { "epoch": 0.1197350207632984, "step": 1211, "train/total_loss": 0.08930569887161255 }, { "entropy": 8.52829647064209, "epoch": 0.11983389361281392, "mean_token_accuracy": 0.7145073413848877, "num_tokens": 6670722.0, "step": 1212, "train/ce_loss": 0.6411594152450562 }, { "epoch": 0.11983389361281392, "step": 1212, "train/sim_loss": 0.08984375 }, { "epoch": 0.11983389361281392, "step": 1212, "train/total_loss": 0.15395969152450562 }, { "entropy": 9.584680557250977, "epoch": 0.11993276646232945, "mean_token_accuracy": 0.7510259747505188, "num_tokens": 6676006.0, "step": 1213, "train/ce_loss": 0.4925919473171234 }, { "epoch": 0.11993276646232945, "step": 1213, "train/sim_loss": 0.02734375 }, { "epoch": 0.11993276646232945, "step": 1213, "train/total_loss": 0.07660295069217682 }, { "entropy": 9.468073844909668, "epoch": 0.12003163931184496, "mean_token_accuracy": 0.7317073345184326, "num_tokens": 6681318.0, "step": 1214, "train/ce_loss": 1.2397695779800415 }, { "epoch": 0.12003163931184496, "step": 1214, "train/sim_loss": 0.109375 }, { "epoch": 0.12003163931184496, "step": 1214, "train/total_loss": 0.2333519607782364 }, { "entropy": 9.239408493041992, "epoch": 0.12013051216136049, "mean_token_accuracy": 0.755750298500061, "num_tokens": 6686837.0, "step": 1215, "train/ce_loss": 0.8917558789253235 }, { "epoch": 0.12013051216136049, "step": 1215, "train/sim_loss": 0.203125 }, { "epoch": 0.12013051216136049, "step": 1215, "train/total_loss": 0.29230058193206787 }, { "entropy": 9.470235824584961, "epoch": 0.12022938501087602, "mean_token_accuracy": 0.695708692073822, "num_tokens": 6692172.0, "step": 1216, "train/ce_loss": 0.6021824479103088 }, { "epoch": 0.12022938501087602, "step": 1216, "train/sim_loss": 0.109375 }, { "epoch": 0.12022938501087602, "step": 1216, "train/total_loss": 0.16959324479103088 }, { "entropy": 9.335431098937988, "epoch": 0.12032825786039153, "mean_token_accuracy": 0.7259615659713745, "num_tokens": 6697591.0, "step": 1217, "train/ce_loss": 0.7047808766365051 }, { "epoch": 0.12032825786039153, "step": 1217, "train/sim_loss": 0.08984375 }, { "epoch": 0.12032825786039153, "step": 1217, "train/total_loss": 0.16032183170318604 }, { "entropy": 9.323629379272461, "epoch": 0.12042713070990706, "mean_token_accuracy": 0.7625298500061035, "num_tokens": 6703005.0, "step": 1218, "train/ce_loss": 0.5070896148681641 }, { "epoch": 0.12042713070990706, "step": 1218, "train/sim_loss": 0.0625 }, { "epoch": 0.12042713070990706, "step": 1218, "train/total_loss": 0.11320896446704865 }, { "entropy": 9.382975578308105, "epoch": 0.12052600355942258, "mean_token_accuracy": 0.739237368106842, "num_tokens": 6708388.0, "step": 1219, "train/ce_loss": 0.9092739820480347 }, { "epoch": 0.12052600355942258, "step": 1219, "train/sim_loss": 0.0859375 }, { "epoch": 0.12052600355942258, "step": 1219, "train/total_loss": 0.176864892244339 }, { "epoch": 0.12062487640893811, "grad_norm": 1.0895336866378784, "learning_rate": 9.701082925381992e-06, "loss": 0.169, "step": 1220 }, { "entropy": 9.465570449829102, "epoch": 0.12062487640893811, "mean_token_accuracy": 0.7506596446037292, "num_tokens": 6713757.0, "step": 1220, "train/ce_loss": 0.7736036777496338 }, { "epoch": 0.12062487640893811, "step": 1220, "train/sim_loss": 0.0859375 }, { "epoch": 0.12062487640893811, "step": 1220, "train/total_loss": 0.1632978618144989 }, { "entropy": 9.453868865966797, "epoch": 0.12072374925845362, "mean_token_accuracy": 0.7448717951774597, "num_tokens": 6719168.0, "step": 1221, "train/ce_loss": 1.068414568901062 }, { "epoch": 0.12072374925845362, "step": 1221, "train/sim_loss": 0.08984375 }, { "epoch": 0.12072374925845362, "step": 1221, "train/total_loss": 0.19668520987033844 }, { "entropy": 9.179931640625, "epoch": 0.12082262210796915, "mean_token_accuracy": 0.7194244861602783, "num_tokens": 6724586.0, "step": 1222, "train/ce_loss": 0.6046621799468994 }, { "epoch": 0.12082262210796915, "step": 1222, "train/sim_loss": 0.0703125 }, { "epoch": 0.12082262210796915, "step": 1222, "train/total_loss": 0.1307787150144577 }, { "entropy": 9.259254455566406, "epoch": 0.12092149495748468, "mean_token_accuracy": 0.7191630005836487, "num_tokens": 6730085.0, "step": 1223, "train/ce_loss": 1.103924036026001 }, { "epoch": 0.12092149495748468, "step": 1223, "train/sim_loss": 0.125 }, { "epoch": 0.12092149495748468, "step": 1223, "train/total_loss": 0.23539240658283234 }, { "entropy": 9.388534545898438, "epoch": 0.12102036780700019, "mean_token_accuracy": 0.7366197109222412, "num_tokens": 6735386.0, "step": 1224, "train/ce_loss": 0.8050982356071472 }, { "epoch": 0.12102036780700019, "step": 1224, "train/sim_loss": 0.07421875 }, { "epoch": 0.12102036780700019, "step": 1224, "train/total_loss": 0.15472857654094696 }, { "entropy": 9.436098098754883, "epoch": 0.12111924065651572, "mean_token_accuracy": 0.7809523940086365, "num_tokens": 6740723.0, "step": 1225, "train/ce_loss": 0.6282393932342529 }, { "epoch": 0.12111924065651572, "step": 1225, "train/sim_loss": 0.14453125 }, { "epoch": 0.12111924065651572, "step": 1225, "train/total_loss": 0.20735520124435425 }, { "entropy": 9.469380378723145, "epoch": 0.12121811350603125, "mean_token_accuracy": 0.7330508232116699, "num_tokens": 6746079.0, "step": 1226, "train/ce_loss": 0.8732038736343384 }, { "epoch": 0.12121811350603125, "step": 1226, "train/sim_loss": 0.08203125 }, { "epoch": 0.12121811350603125, "step": 1226, "train/total_loss": 0.16935163736343384 }, { "entropy": 9.346443176269531, "epoch": 0.12131698635554676, "mean_token_accuracy": 0.7373211979866028, "num_tokens": 6751456.0, "step": 1227, "train/ce_loss": 0.714235782623291 }, { "epoch": 0.12131698635554676, "step": 1227, "train/sim_loss": 0.0625 }, { "epoch": 0.12131698635554676, "step": 1227, "train/total_loss": 0.13392359018325806 }, { "entropy": 9.454696655273438, "epoch": 0.12141585920506229, "mean_token_accuracy": 0.7690355181694031, "num_tokens": 6756864.0, "step": 1228, "train/ce_loss": 0.3660595118999481 }, { "epoch": 0.12141585920506229, "step": 1228, "train/sim_loss": 0.06640625 }, { "epoch": 0.12141585920506229, "step": 1228, "train/total_loss": 0.10301220417022705 }, { "entropy": 9.439504623413086, "epoch": 0.12151473205457781, "mean_token_accuracy": 0.7509578466415405, "num_tokens": 6762223.0, "step": 1229, "train/ce_loss": 0.9472532868385315 }, { "epoch": 0.12151473205457781, "step": 1229, "train/sim_loss": 0.05078125 }, { "epoch": 0.12151473205457781, "step": 1229, "train/total_loss": 0.1455065906047821 }, { "entropy": 9.073904991149902, "epoch": 0.12161360490409334, "mean_token_accuracy": 0.6980676054954529, "num_tokens": 6767572.0, "step": 1230, "train/ce_loss": 0.7551958560943604 }, { "epoch": 0.12161360490409334, "step": 1230, "train/sim_loss": 0.140625 }, { "epoch": 0.12161360490409334, "step": 1230, "train/total_loss": 0.2161445915699005 }, { "entropy": 8.975247383117676, "epoch": 0.12171247775360886, "mean_token_accuracy": 0.6666666865348816, "num_tokens": 6773274.0, "step": 1231, "train/ce_loss": 1.6073322296142578 }, { "epoch": 0.12171247775360886, "step": 1231, "train/sim_loss": 0.07421875 }, { "epoch": 0.12171247775360886, "step": 1231, "train/total_loss": 0.23495197296142578 }, { "entropy": 9.176767349243164, "epoch": 0.12181135060312438, "mean_token_accuracy": 0.6834763884544373, "num_tokens": 6778837.0, "step": 1232, "train/ce_loss": 0.822350800037384 }, { "epoch": 0.12181135060312438, "step": 1232, "train/sim_loss": 0.09375 }, { "epoch": 0.12181135060312438, "step": 1232, "train/total_loss": 0.17598508298397064 }, { "entropy": 9.353265762329102, "epoch": 0.12191022345263991, "mean_token_accuracy": 0.7544757127761841, "num_tokens": 6784291.0, "step": 1233, "train/ce_loss": 0.809382975101471 }, { "epoch": 0.12191022345263991, "step": 1233, "train/sim_loss": 0.09375 }, { "epoch": 0.12191022345263991, "step": 1233, "train/total_loss": 0.17468830943107605 }, { "entropy": 9.118939399719238, "epoch": 0.12200909630215542, "mean_token_accuracy": 0.738071084022522, "num_tokens": 6789894.0, "step": 1234, "train/ce_loss": 0.7437105178833008 }, { "epoch": 0.12200909630215542, "step": 1234, "train/sim_loss": 0.0546875 }, { "epoch": 0.12200909630215542, "step": 1234, "train/total_loss": 0.12905855476856232 }, { "entropy": 9.424604415893555, "epoch": 0.12210796915167095, "mean_token_accuracy": 0.75, "num_tokens": 6795226.0, "step": 1235, "train/ce_loss": 0.6040997505187988 }, { "epoch": 0.12210796915167095, "step": 1235, "train/sim_loss": 0.046875 }, { "epoch": 0.12210796915167095, "step": 1235, "train/total_loss": 0.10728497803211212 }, { "entropy": 8.995025634765625, "epoch": 0.12220684200118648, "mean_token_accuracy": 0.7612704634666443, "num_tokens": 6800886.0, "step": 1236, "train/ce_loss": 0.5682891011238098 }, { "epoch": 0.12220684200118648, "step": 1236, "train/sim_loss": 0.109375 }, { "epoch": 0.12220684200118648, "step": 1236, "train/total_loss": 0.16620391607284546 }, { "entropy": 9.486865997314453, "epoch": 0.12230571485070199, "mean_token_accuracy": 0.7152777910232544, "num_tokens": 6806239.0, "step": 1237, "train/ce_loss": 0.6640860438346863 }, { "epoch": 0.12230571485070199, "step": 1237, "train/sim_loss": 0.0859375 }, { "epoch": 0.12230571485070199, "step": 1237, "train/total_loss": 0.15234610438346863 }, { "entropy": 9.176627159118652, "epoch": 0.12240458770021752, "mean_token_accuracy": 0.6740976572036743, "num_tokens": 6811830.0, "step": 1238, "train/ce_loss": 0.9452435374259949 }, { "epoch": 0.12240458770021752, "step": 1238, "train/sim_loss": 0.05859375 }, { "epoch": 0.12240458770021752, "step": 1238, "train/total_loss": 0.1531181037425995 }, { "entropy": 9.324590682983398, "epoch": 0.12250346054973305, "mean_token_accuracy": 0.769626796245575, "num_tokens": 6817270.0, "step": 1239, "train/ce_loss": 0.7870416045188904 }, { "epoch": 0.12250346054973305, "step": 1239, "train/sim_loss": 0.0703125 }, { "epoch": 0.12250346054973305, "step": 1239, "train/total_loss": 0.14901666343212128 }, { "epoch": 0.12260233339924857, "grad_norm": 0.9433260560035706, "learning_rate": 9.696138060624043e-06, "loss": 0.1704, "step": 1240 }, { "entropy": 9.388139724731445, "epoch": 0.12260233339924857, "mean_token_accuracy": 0.70567786693573, "num_tokens": 6822769.0, "step": 1240, "train/ce_loss": 0.36732247471809387 }, { "epoch": 0.12260233339924857, "step": 1240, "train/sim_loss": 0.0546875 }, { "epoch": 0.12260233339924857, "step": 1240, "train/total_loss": 0.0914197489619255 }, { "entropy": 8.856678009033203, "epoch": 0.12270120624876409, "mean_token_accuracy": 0.7308333516120911, "num_tokens": 6828799.0, "step": 1241, "train/ce_loss": 0.9606333374977112 }, { "epoch": 0.12270120624876409, "step": 1241, "train/sim_loss": 0.078125 }, { "epoch": 0.12270120624876409, "step": 1241, "train/total_loss": 0.17418834567070007 }, { "entropy": 8.878345489501953, "epoch": 0.12280007909827961, "mean_token_accuracy": 0.7285463809967041, "num_tokens": 6834513.0, "step": 1242, "train/ce_loss": 0.5847339630126953 }, { "epoch": 0.12280007909827961, "step": 1242, "train/sim_loss": 0.0625 }, { "epoch": 0.12280007909827961, "step": 1242, "train/total_loss": 0.12097339332103729 }, { "entropy": 9.26901626586914, "epoch": 0.12289895194779514, "mean_token_accuracy": 0.7979333996772766, "num_tokens": 6840038.0, "step": 1243, "train/ce_loss": 0.5028206706047058 }, { "epoch": 0.12289895194779514, "step": 1243, "train/sim_loss": 0.09375 }, { "epoch": 0.12289895194779514, "step": 1243, "train/total_loss": 0.1440320611000061 }, { "entropy": 9.41348648071289, "epoch": 0.12299782479731065, "mean_token_accuracy": 0.6974219679832458, "num_tokens": 6845420.0, "step": 1244, "train/ce_loss": 1.2198346853256226 }, { "epoch": 0.12299782479731065, "step": 1244, "train/sim_loss": 0.0546875 }, { "epoch": 0.12299782479731065, "step": 1244, "train/total_loss": 0.17667096853256226 }, { "entropy": 9.201607704162598, "epoch": 0.12309669764682618, "mean_token_accuracy": 0.7422459721565247, "num_tokens": 6850918.0, "step": 1245, "train/ce_loss": 0.8049962520599365 }, { "epoch": 0.12309669764682618, "step": 1245, "train/sim_loss": 0.0859375 }, { "epoch": 0.12309669764682618, "step": 1245, "train/total_loss": 0.16643711924552917 }, { "entropy": 9.345949172973633, "epoch": 0.12319557049634171, "mean_token_accuracy": 0.6945454478263855, "num_tokens": 6856400.0, "step": 1246, "train/ce_loss": 0.9168776869773865 }, { "epoch": 0.12319557049634171, "step": 1246, "train/sim_loss": 0.11328125 }, { "epoch": 0.12319557049634171, "step": 1246, "train/total_loss": 0.20496901869773865 }, { "entropy": 9.256263732910156, "epoch": 0.12329444334585722, "mean_token_accuracy": 0.749171257019043, "num_tokens": 6861943.0, "step": 1247, "train/ce_loss": 0.8629254102706909 }, { "epoch": 0.12329444334585722, "step": 1247, "train/sim_loss": 0.0625 }, { "epoch": 0.12329444334585722, "step": 1247, "train/total_loss": 0.14879253506660461 }, { "entropy": 9.065906524658203, "epoch": 0.12339331619537275, "mean_token_accuracy": 0.7372727394104004, "num_tokens": 6867645.0, "step": 1248, "train/ce_loss": 0.43920832872390747 }, { "epoch": 0.12339331619537275, "step": 1248, "train/sim_loss": 0.0625 }, { "epoch": 0.12339331619537275, "step": 1248, "train/total_loss": 0.10642082989215851 }, { "entropy": 9.029417037963867, "epoch": 0.12349218904488828, "mean_token_accuracy": 0.731501042842865, "num_tokens": 6873143.0, "step": 1249, "train/ce_loss": 0.8172164559364319 }, { "epoch": 0.12349218904488828, "step": 1249, "train/sim_loss": 0.0625 }, { "epoch": 0.12349218904488828, "step": 1249, "train/total_loss": 0.14422164857387543 }, { "entropy": 9.454422950744629, "epoch": 0.1235910618944038, "mean_token_accuracy": 0.725784420967102, "num_tokens": 6878522.0, "step": 1250, "train/ce_loss": 0.655506432056427 }, { "epoch": 0.1235910618944038, "step": 1250, "train/sim_loss": 0.05078125 }, { "epoch": 0.1235910618944038, "step": 1250, "train/total_loss": 0.11633189767599106 }, { "entropy": 9.003074645996094, "epoch": 0.12368993474391932, "mean_token_accuracy": 0.7710501551628113, "num_tokens": 6884172.0, "step": 1251, "train/ce_loss": 0.889735758304596 }, { "epoch": 0.12368993474391932, "step": 1251, "train/sim_loss": 0.05859375 }, { "epoch": 0.12368993474391932, "step": 1251, "train/total_loss": 0.14756733179092407 }, { "entropy": 9.359220504760742, "epoch": 0.12378880759343484, "mean_token_accuracy": 0.7386215925216675, "num_tokens": 6889586.0, "step": 1252, "train/ce_loss": 0.5373151302337646 }, { "epoch": 0.12378880759343484, "step": 1252, "train/sim_loss": 0.05078125 }, { "epoch": 0.12378880759343484, "step": 1252, "train/total_loss": 0.1045127660036087 }, { "entropy": 9.361841201782227, "epoch": 0.12388768044295037, "mean_token_accuracy": 0.73072749376297, "num_tokens": 6895072.0, "step": 1253, "train/ce_loss": 0.805940568447113 }, { "epoch": 0.12388768044295037, "step": 1253, "train/sim_loss": 0.0703125 }, { "epoch": 0.12388768044295037, "step": 1253, "train/total_loss": 0.15090656280517578 }, { "entropy": 9.534307479858398, "epoch": 0.12398655329246588, "mean_token_accuracy": 0.772455096244812, "num_tokens": 6900626.0, "step": 1254, "train/ce_loss": 0.8410473465919495 }, { "epoch": 0.12398655329246588, "step": 1254, "train/sim_loss": 0.03125 }, { "epoch": 0.12398655329246588, "step": 1254, "train/total_loss": 0.1153547391295433 }, { "entropy": 9.26573371887207, "epoch": 0.12408542614198141, "mean_token_accuracy": 0.7477375268936157, "num_tokens": 6906172.0, "step": 1255, "train/ce_loss": 0.676632285118103 }, { "epoch": 0.12408542614198141, "step": 1255, "train/sim_loss": 0.07421875 }, { "epoch": 0.12408542614198141, "step": 1255, "train/total_loss": 0.14188197255134583 }, { "entropy": 9.362611770629883, "epoch": 0.12418429899149694, "mean_token_accuracy": 0.7800453305244446, "num_tokens": 6911677.0, "step": 1256, "train/ce_loss": 0.7088720202445984 }, { "epoch": 0.12418429899149694, "step": 1256, "train/sim_loss": 0.04296875 }, { "epoch": 0.12418429899149694, "step": 1256, "train/total_loss": 0.11385595053434372 }, { "entropy": 9.075122833251953, "epoch": 0.12428317184101245, "mean_token_accuracy": 0.7351351380348206, "num_tokens": 6917229.0, "step": 1257, "train/ce_loss": 0.7109876275062561 }, { "epoch": 0.12428317184101245, "step": 1257, "train/sim_loss": 0.11328125 }, { "epoch": 0.12428317184101245, "step": 1257, "train/total_loss": 0.18438002467155457 }, { "entropy": 9.387807846069336, "epoch": 0.12438204469052798, "mean_token_accuracy": 0.6936936974525452, "num_tokens": 6922701.0, "step": 1258, "train/ce_loss": 1.5460973978042603 }, { "epoch": 0.12438204469052798, "step": 1258, "train/sim_loss": 0.09765625 }, { "epoch": 0.12438204469052798, "step": 1258, "train/total_loss": 0.252265989780426 }, { "entropy": 9.19167709350586, "epoch": 0.1244809175400435, "mean_token_accuracy": 0.7427536249160767, "num_tokens": 6928117.0, "step": 1259, "train/ce_loss": 0.8916782736778259 }, { "epoch": 0.1244809175400435, "step": 1259, "train/sim_loss": 0.140625 }, { "epoch": 0.1244809175400435, "step": 1259, "train/total_loss": 0.22979283332824707 }, { "epoch": 0.12457979038955903, "grad_norm": 0.9852316379547119, "learning_rate": 9.691193195866095e-06, "loss": 0.1602, "step": 1260 }, { "entropy": 9.51093864440918, "epoch": 0.12457979038955903, "mean_token_accuracy": 0.73041170835495, "num_tokens": 6933510.0, "step": 1260, "train/ce_loss": 0.9200326204299927 }, { "epoch": 0.12457979038955903, "step": 1260, "train/sim_loss": 0.06640625 }, { "epoch": 0.12457979038955903, "step": 1260, "train/total_loss": 0.1584095060825348 }, { "entropy": 9.303045272827148, "epoch": 0.12467866323907455, "mean_token_accuracy": 0.7980653047561646, "num_tokens": 6938952.0, "step": 1261, "train/ce_loss": 0.36515629291534424 }, { "epoch": 0.12467866323907455, "step": 1261, "train/sim_loss": 0.0234375 }, { "epoch": 0.12467866323907455, "step": 1261, "train/total_loss": 0.05995313078165054 }, { "entropy": 9.602996826171875, "epoch": 0.12477753608859007, "mean_token_accuracy": 0.75, "num_tokens": 6944195.0, "step": 1262, "train/ce_loss": 0.7286627292633057 }, { "epoch": 0.12477753608859007, "step": 1262, "train/sim_loss": 0.0546875 }, { "epoch": 0.12477753608859007, "step": 1262, "train/total_loss": 0.1275537759065628 }, { "entropy": 9.382024765014648, "epoch": 0.1248764089381056, "mean_token_accuracy": 0.7490820288658142, "num_tokens": 6949544.0, "step": 1263, "train/ce_loss": 1.2428936958312988 }, { "epoch": 0.1248764089381056, "step": 1263, "train/sim_loss": 0.12890625 }, { "epoch": 0.1248764089381056, "step": 1263, "train/total_loss": 0.2531956136226654 }, { "entropy": 9.082314491271973, "epoch": 0.12497528178762111, "mean_token_accuracy": 0.7622298002243042, "num_tokens": 6955038.0, "step": 1264, "train/ce_loss": 0.8522223830223083 }, { "epoch": 0.12497528178762111, "step": 1264, "train/sim_loss": 0.06640625 }, { "epoch": 0.12497528178762111, "step": 1264, "train/total_loss": 0.1516284942626953 }, { "entropy": 9.277280807495117, "epoch": 0.12507415463713664, "mean_token_accuracy": 0.7811791300773621, "num_tokens": 6960525.0, "step": 1265, "train/ce_loss": 0.7528575658798218 }, { "epoch": 0.12507415463713664, "step": 1265, "train/sim_loss": 0.0703125 }, { "epoch": 0.12507415463713664, "step": 1265, "train/total_loss": 0.14559826254844666 }, { "entropy": 9.367176055908203, "epoch": 0.12517302748665216, "mean_token_accuracy": 0.7554945349693298, "num_tokens": 6965892.0, "step": 1266, "train/ce_loss": 0.9208450317382812 }, { "epoch": 0.12517302748665216, "step": 1266, "train/sim_loss": 0.109375 }, { "epoch": 0.12517302748665216, "step": 1266, "train/total_loss": 0.20145949721336365 }, { "entropy": 9.566551208496094, "epoch": 0.1252719003361677, "mean_token_accuracy": 0.7085714340209961, "num_tokens": 6971186.0, "step": 1267, "train/ce_loss": 1.0691659450531006 }, { "epoch": 0.1252719003361677, "step": 1267, "train/sim_loss": 0.11328125 }, { "epoch": 0.1252719003361677, "step": 1267, "train/total_loss": 0.220197856426239 }, { "entropy": 9.436025619506836, "epoch": 0.1253707731856832, "mean_token_accuracy": 0.7324455380439758, "num_tokens": 6976566.0, "step": 1268, "train/ce_loss": 0.8160316944122314 }, { "epoch": 0.1253707731856832, "step": 1268, "train/sim_loss": 0.06640625 }, { "epoch": 0.1253707731856832, "step": 1268, "train/total_loss": 0.14800941944122314 }, { "entropy": 9.2811279296875, "epoch": 0.12546964603519872, "mean_token_accuracy": 0.7503045201301575, "num_tokens": 6982022.0, "step": 1269, "train/ce_loss": 0.7008271217346191 }, { "epoch": 0.12546964603519872, "step": 1269, "train/sim_loss": 0.0546875 }, { "epoch": 0.12546964603519872, "step": 1269, "train/total_loss": 0.12477021664381027 }, { "entropy": 9.501692771911621, "epoch": 0.12556851888471426, "mean_token_accuracy": 0.7055630683898926, "num_tokens": 6987399.0, "step": 1270, "train/ce_loss": 0.6716254949569702 }, { "epoch": 0.12556851888471426, "step": 1270, "train/sim_loss": 0.0625 }, { "epoch": 0.12556851888471426, "step": 1270, "train/total_loss": 0.12966254353523254 }, { "entropy": 9.294499397277832, "epoch": 0.12566739173422978, "mean_token_accuracy": 0.7287299633026123, "num_tokens": 6992753.0, "step": 1271, "train/ce_loss": 0.8194760680198669 }, { "epoch": 0.12566739173422978, "step": 1271, "train/sim_loss": 0.11328125 }, { "epoch": 0.12566739173422978, "step": 1271, "train/total_loss": 0.19522885978221893 }, { "entropy": 9.52720832824707, "epoch": 0.1257662645837453, "mean_token_accuracy": 0.7698744535446167, "num_tokens": 6998033.0, "step": 1272, "train/ce_loss": 1.0132781267166138 }, { "epoch": 0.1257662645837453, "step": 1272, "train/sim_loss": 0.08203125 }, { "epoch": 0.1257662645837453, "step": 1272, "train/total_loss": 0.1833590567111969 }, { "entropy": 9.10108470916748, "epoch": 0.12586513743326083, "mean_token_accuracy": 0.7395715713500977, "num_tokens": 7003566.0, "step": 1273, "train/ce_loss": 1.1197662353515625 }, { "epoch": 0.12586513743326083, "step": 1273, "train/sim_loss": 0.08203125 }, { "epoch": 0.12586513743326083, "step": 1273, "train/total_loss": 0.19400787353515625 }, { "entropy": 9.401548385620117, "epoch": 0.12596401028277635, "mean_token_accuracy": 0.7599999904632568, "num_tokens": 7008890.0, "step": 1274, "train/ce_loss": 0.5323838591575623 }, { "epoch": 0.12596401028277635, "step": 1274, "train/sim_loss": 0.09375 }, { "epoch": 0.12596401028277635, "step": 1274, "train/total_loss": 0.1469883918762207 }, { "entropy": 9.598976135253906, "epoch": 0.1260628831322919, "mean_token_accuracy": 0.7252124547958374, "num_tokens": 7014110.0, "step": 1275, "train/ce_loss": 1.2816715240478516 }, { "epoch": 0.1260628831322919, "step": 1275, "train/sim_loss": 0.13671875 }, { "epoch": 0.1260628831322919, "step": 1275, "train/total_loss": 0.26488590240478516 }, { "entropy": 9.380779266357422, "epoch": 0.1261617559818074, "mean_token_accuracy": 0.7259158492088318, "num_tokens": 7019478.0, "step": 1276, "train/ce_loss": 0.6786596179008484 }, { "epoch": 0.1261617559818074, "step": 1276, "train/sim_loss": 0.10546875 }, { "epoch": 0.1261617559818074, "step": 1276, "train/total_loss": 0.17333471775054932 }, { "entropy": 9.503196716308594, "epoch": 0.1262606288313229, "mean_token_accuracy": 0.7322335243225098, "num_tokens": 7024828.0, "step": 1277, "train/ce_loss": 0.8124256134033203 }, { "epoch": 0.1262606288313229, "step": 1277, "train/sim_loss": 0.1015625 }, { "epoch": 0.1262606288313229, "step": 1277, "train/total_loss": 0.18280506134033203 }, { "entropy": 8.968796730041504, "epoch": 0.12635950168083845, "mean_token_accuracy": 0.7747196555137634, "num_tokens": 7030403.0, "step": 1278, "train/ce_loss": 0.7540860772132874 }, { "epoch": 0.12635950168083845, "step": 1278, "train/sim_loss": 0.03125 }, { "epoch": 0.12635950168083845, "step": 1278, "train/total_loss": 0.10665860772132874 }, { "entropy": 9.263203620910645, "epoch": 0.12645837453035397, "mean_token_accuracy": 0.7401197552680969, "num_tokens": 7035877.0, "step": 1279, "train/ce_loss": 0.8757287859916687 }, { "epoch": 0.12645837453035397, "step": 1279, "train/sim_loss": 0.09375 }, { "epoch": 0.12645837453035397, "step": 1279, "train/total_loss": 0.1813228726387024 }, { "epoch": 0.12655724737986948, "grad_norm": 1.0084494352340698, "learning_rate": 9.686248331108144e-06, "loss": 0.1585, "step": 1280 }, { "entropy": 9.020414352416992, "epoch": 0.12655724737986948, "mean_token_accuracy": 0.7691511511802673, "num_tokens": 7041473.0, "step": 1280, "train/ce_loss": 0.5087365508079529 }, { "epoch": 0.12655724737986948, "step": 1280, "train/sim_loss": 0.03515625 }, { "epoch": 0.12655724737986948, "step": 1280, "train/total_loss": 0.08602990210056305 }, { "entropy": 9.218118667602539, "epoch": 0.12665612022938502, "mean_token_accuracy": 0.7325443625450134, "num_tokens": 7046902.0, "step": 1281, "train/ce_loss": 1.012427568435669 }, { "epoch": 0.12665612022938502, "step": 1281, "train/sim_loss": 0.08203125 }, { "epoch": 0.12665612022938502, "step": 1281, "train/total_loss": 0.18327400088310242 }, { "entropy": 9.406867980957031, "epoch": 0.12675499307890054, "mean_token_accuracy": 0.7431192398071289, "num_tokens": 7052475.0, "step": 1282, "train/ce_loss": 1.1164335012435913 }, { "epoch": 0.12675499307890054, "step": 1282, "train/sim_loss": 0.09765625 }, { "epoch": 0.12675499307890054, "step": 1282, "train/total_loss": 0.20929959416389465 }, { "entropy": 9.11520767211914, "epoch": 0.12685386592841605, "mean_token_accuracy": 0.7905759215354919, "num_tokens": 7057992.0, "step": 1283, "train/ce_loss": 0.4309730529785156 }, { "epoch": 0.12685386592841605, "step": 1283, "train/sim_loss": 0.03515625 }, { "epoch": 0.12685386592841605, "step": 1283, "train/total_loss": 0.07825355231761932 }, { "entropy": 9.277780532836914, "epoch": 0.1269527387779316, "mean_token_accuracy": 0.7177143096923828, "num_tokens": 7063531.0, "step": 1284, "train/ce_loss": 0.9299321174621582 }, { "epoch": 0.1269527387779316, "step": 1284, "train/sim_loss": 0.09375 }, { "epoch": 0.1269527387779316, "step": 1284, "train/total_loss": 0.18674321472644806 }, { "entropy": 9.094381332397461, "epoch": 0.1270516116274471, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 7069003.0, "step": 1285, "train/ce_loss": 0.9346034526824951 }, { "epoch": 0.1270516116274471, "step": 1285, "train/sim_loss": 0.125 }, { "epoch": 0.1270516116274471, "step": 1285, "train/total_loss": 0.218460351228714 }, { "entropy": 8.953091621398926, "epoch": 0.12715048447696262, "mean_token_accuracy": 0.8129770755767822, "num_tokens": 7074703.0, "step": 1286, "train/ce_loss": 0.3743775486946106 }, { "epoch": 0.12715048447696262, "step": 1286, "train/sim_loss": 0.125 }, { "epoch": 0.12715048447696262, "step": 1286, "train/total_loss": 0.16243775188922882 }, { "entropy": 9.206169128417969, "epoch": 0.12724935732647816, "mean_token_accuracy": 0.7846153974533081, "num_tokens": 7080058.0, "step": 1287, "train/ce_loss": 0.6838846802711487 }, { "epoch": 0.12724935732647816, "step": 1287, "train/sim_loss": 0.0703125 }, { "epoch": 0.12724935732647816, "step": 1287, "train/total_loss": 0.1387009620666504 }, { "entropy": 9.297235488891602, "epoch": 0.12734823017599367, "mean_token_accuracy": 0.746760904788971, "num_tokens": 7085439.0, "step": 1288, "train/ce_loss": 0.5713454484939575 }, { "epoch": 0.12734823017599367, "step": 1288, "train/sim_loss": 0.03125 }, { "epoch": 0.12734823017599367, "step": 1288, "train/total_loss": 0.08838454633951187 }, { "entropy": 9.132660865783691, "epoch": 0.12744710302550918, "mean_token_accuracy": 0.752953827381134, "num_tokens": 7091191.0, "step": 1289, "train/ce_loss": 0.6702521443367004 }, { "epoch": 0.12744710302550918, "step": 1289, "train/sim_loss": 0.11328125 }, { "epoch": 0.12744710302550918, "step": 1289, "train/total_loss": 0.18030646443367004 }, { "entropy": 9.542570114135742, "epoch": 0.12754597587502473, "mean_token_accuracy": 0.7393767833709717, "num_tokens": 7096478.0, "step": 1290, "train/ce_loss": 0.7676664590835571 }, { "epoch": 0.12754597587502473, "step": 1290, "train/sim_loss": 0.0859375 }, { "epoch": 0.12754597587502473, "step": 1290, "train/total_loss": 0.16270413994789124 }, { "entropy": 9.287704467773438, "epoch": 0.12764484872454024, "mean_token_accuracy": 0.7754601240158081, "num_tokens": 7101911.0, "step": 1291, "train/ce_loss": 0.6094826459884644 }, { "epoch": 0.12764484872454024, "step": 1291, "train/sim_loss": 0.11328125 }, { "epoch": 0.12764484872454024, "step": 1291, "train/total_loss": 0.17422951757907867 }, { "entropy": 9.030458450317383, "epoch": 0.12774372157405575, "mean_token_accuracy": 0.6783754229545593, "num_tokens": 7107448.0, "step": 1292, "train/ce_loss": 0.8474029302597046 }, { "epoch": 0.12774372157405575, "step": 1292, "train/sim_loss": 0.0546875 }, { "epoch": 0.12774372157405575, "step": 1292, "train/total_loss": 0.1394277960062027 }, { "entropy": 9.437826156616211, "epoch": 0.1278425944235713, "mean_token_accuracy": 0.7421758770942688, "num_tokens": 7112694.0, "step": 1293, "train/ce_loss": 0.7044265270233154 }, { "epoch": 0.1278425944235713, "step": 1293, "train/sim_loss": 0.08984375 }, { "epoch": 0.1278425944235713, "step": 1293, "train/total_loss": 0.16028639674186707 }, { "entropy": 9.388540267944336, "epoch": 0.1279414672730868, "mean_token_accuracy": 0.6758530139923096, "num_tokens": 7118058.0, "step": 1294, "train/ce_loss": 0.8560420870780945 }, { "epoch": 0.1279414672730868, "step": 1294, "train/sim_loss": 0.10546875 }, { "epoch": 0.1279414672730868, "step": 1294, "train/total_loss": 0.1910729706287384 }, { "entropy": 9.096719741821289, "epoch": 0.12804034012260232, "mean_token_accuracy": 0.7158119678497314, "num_tokens": 7123519.0, "step": 1295, "train/ce_loss": 1.0085091590881348 }, { "epoch": 0.12804034012260232, "step": 1295, "train/sim_loss": 0.0859375 }, { "epoch": 0.12804034012260232, "step": 1295, "train/total_loss": 0.186788409948349 }, { "entropy": 9.34440803527832, "epoch": 0.12813921297211786, "mean_token_accuracy": 0.702036440372467, "num_tokens": 7128988.0, "step": 1296, "train/ce_loss": 1.194170355796814 }, { "epoch": 0.12813921297211786, "step": 1296, "train/sim_loss": 0.109375 }, { "epoch": 0.12813921297211786, "step": 1296, "train/total_loss": 0.22879204154014587 }, { "entropy": 8.832083702087402, "epoch": 0.12823808582163337, "mean_token_accuracy": 0.727668821811676, "num_tokens": 7134534.0, "step": 1297, "train/ce_loss": 1.0494314432144165 }, { "epoch": 0.12823808582163337, "step": 1297, "train/sim_loss": 0.09375 }, { "epoch": 0.12823808582163337, "step": 1297, "train/total_loss": 0.1986931562423706 }, { "entropy": 9.100629806518555, "epoch": 0.12833695867114892, "mean_token_accuracy": 0.7856420874595642, "num_tokens": 7140364.0, "step": 1298, "train/ce_loss": 0.7436246275901794 }, { "epoch": 0.12833695867114892, "step": 1298, "train/sim_loss": 0.0390625 }, { "epoch": 0.12833695867114892, "step": 1298, "train/total_loss": 0.11342496424913406 }, { "entropy": 9.381462097167969, "epoch": 0.12843583152066443, "mean_token_accuracy": 0.75, "num_tokens": 7145682.0, "step": 1299, "train/ce_loss": 0.9730138778686523 }, { "epoch": 0.12843583152066443, "step": 1299, "train/sim_loss": 0.0703125 }, { "epoch": 0.12843583152066443, "step": 1299, "train/total_loss": 0.1676138937473297 }, { "epoch": 0.12853470437017994, "grad_norm": 0.9528304934501648, "learning_rate": 9.681303466350196e-06, "loss": 0.1656, "step": 1300 }, { "entropy": 9.15294075012207, "epoch": 0.12853470437017994, "mean_token_accuracy": 0.737574577331543, "num_tokens": 7151341.0, "step": 1300, "train/ce_loss": 0.3628046214580536 }, { "epoch": 0.12853470437017994, "step": 1300, "train/sim_loss": 0.0625 }, { "epoch": 0.12853470437017994, "step": 1300, "train/total_loss": 0.09878046810626984 }, { "entropy": 9.363946914672852, "epoch": 0.12863357721969548, "mean_token_accuracy": 0.7862949967384338, "num_tokens": 7156737.0, "step": 1301, "train/ce_loss": 0.6306625008583069 }, { "epoch": 0.12863357721969548, "step": 1301, "train/sim_loss": 0.0234375 }, { "epoch": 0.12863357721969548, "step": 1301, "train/total_loss": 0.08650375157594681 }, { "entropy": 9.251348495483398, "epoch": 0.128732450069211, "mean_token_accuracy": 0.7613365054130554, "num_tokens": 7162225.0, "step": 1302, "train/ce_loss": 0.452516108751297 }, { "epoch": 0.128732450069211, "step": 1302, "train/sim_loss": 0.0390625 }, { "epoch": 0.128732450069211, "step": 1302, "train/total_loss": 0.08431410789489746 }, { "entropy": 9.407377243041992, "epoch": 0.1288313229187265, "mean_token_accuracy": 0.7561608552932739, "num_tokens": 7167614.0, "step": 1303, "train/ce_loss": 0.6050813794136047 }, { "epoch": 0.1288313229187265, "step": 1303, "train/sim_loss": 0.078125 }, { "epoch": 0.1288313229187265, "step": 1303, "train/total_loss": 0.138633131980896 }, { "entropy": 8.905672073364258, "epoch": 0.12893019576824205, "mean_token_accuracy": 0.7451737523078918, "num_tokens": 7173301.0, "step": 1304, "train/ce_loss": 0.848776638507843 }, { "epoch": 0.12893019576824205, "step": 1304, "train/sim_loss": 0.06640625 }, { "epoch": 0.12893019576824205, "step": 1304, "train/total_loss": 0.15128391981124878 }, { "entropy": 9.01005744934082, "epoch": 0.12902906861775756, "mean_token_accuracy": 0.7082096934318542, "num_tokens": 7178887.0, "step": 1305, "train/ce_loss": 0.4581758677959442 }, { "epoch": 0.12902906861775756, "step": 1305, "train/sim_loss": 0.08203125 }, { "epoch": 0.12902906861775756, "step": 1305, "train/total_loss": 0.12784883379936218 }, { "entropy": 9.184677124023438, "epoch": 0.12912794146727308, "mean_token_accuracy": 0.7900113463401794, "num_tokens": 7184381.0, "step": 1306, "train/ce_loss": 0.5759574770927429 }, { "epoch": 0.12912794146727308, "step": 1306, "train/sim_loss": 0.08984375 }, { "epoch": 0.12912794146727308, "step": 1306, "train/total_loss": 0.14743949472904205 }, { "entropy": 9.15959644317627, "epoch": 0.12922681431678862, "mean_token_accuracy": 0.8029782176017761, "num_tokens": 7189894.0, "step": 1307, "train/ce_loss": 0.4694107472896576 }, { "epoch": 0.12922681431678862, "step": 1307, "train/sim_loss": 0.03125 }, { "epoch": 0.12922681431678862, "step": 1307, "train/total_loss": 0.07819107174873352 }, { "entropy": 9.145336151123047, "epoch": 0.12932568716630413, "mean_token_accuracy": 0.7427241206169128, "num_tokens": 7195383.0, "step": 1308, "train/ce_loss": 0.8108339309692383 }, { "epoch": 0.12932568716630413, "step": 1308, "train/sim_loss": 0.0546875 }, { "epoch": 0.12932568716630413, "step": 1308, "train/total_loss": 0.13577088713645935 }, { "entropy": 9.214110374450684, "epoch": 0.12942456001581965, "mean_token_accuracy": 0.7340686321258545, "num_tokens": 7200942.0, "step": 1309, "train/ce_loss": 0.46710458397865295 }, { "epoch": 0.12942456001581965, "step": 1309, "train/sim_loss": 0.0546875 }, { "epoch": 0.12942456001581965, "step": 1309, "train/total_loss": 0.10139796137809753 }, { "entropy": 9.277544021606445, "epoch": 0.1295234328653352, "mean_token_accuracy": 0.7592067718505859, "num_tokens": 7206258.0, "step": 1310, "train/ce_loss": 0.2762885093688965 }, { "epoch": 0.1295234328653352, "step": 1310, "train/sim_loss": 0.09375 }, { "epoch": 0.1295234328653352, "step": 1310, "train/total_loss": 0.12137885391712189 }, { "entropy": 9.065816879272461, "epoch": 0.1296223057148507, "mean_token_accuracy": 0.6956077814102173, "num_tokens": 7211824.0, "step": 1311, "train/ce_loss": 0.6651068925857544 }, { "epoch": 0.1296223057148507, "step": 1311, "train/sim_loss": 0.10546875 }, { "epoch": 0.1296223057148507, "step": 1311, "train/total_loss": 0.17197944223880768 }, { "entropy": 9.13143253326416, "epoch": 0.1297211785643662, "mean_token_accuracy": 0.7660753726959229, "num_tokens": 7217346.0, "step": 1312, "train/ce_loss": 0.8136122226715088 }, { "epoch": 0.1297211785643662, "step": 1312, "train/sim_loss": 0.1015625 }, { "epoch": 0.1297211785643662, "step": 1312, "train/total_loss": 0.18292373418807983 }, { "entropy": 9.203581809997559, "epoch": 0.12982005141388175, "mean_token_accuracy": 0.7467532753944397, "num_tokens": 7222773.0, "step": 1313, "train/ce_loss": 0.5560775399208069 }, { "epoch": 0.12982005141388175, "step": 1313, "train/sim_loss": 0.03515625 }, { "epoch": 0.12982005141388175, "step": 1313, "train/total_loss": 0.09076400101184845 }, { "entropy": 9.116201400756836, "epoch": 0.12991892426339727, "mean_token_accuracy": 0.7802197933197021, "num_tokens": 7228306.0, "step": 1314, "train/ce_loss": 0.663007915019989 }, { "epoch": 0.12991892426339727, "step": 1314, "train/sim_loss": 0.0703125 }, { "epoch": 0.12991892426339727, "step": 1314, "train/total_loss": 0.13661329448223114 }, { "entropy": 9.02784538269043, "epoch": 0.13001779711291278, "mean_token_accuracy": 0.7371727824211121, "num_tokens": 7233851.0, "step": 1315, "train/ce_loss": 1.258785367012024 }, { "epoch": 0.13001779711291278, "step": 1315, "train/sim_loss": 0.08984375 }, { "epoch": 0.13001779711291278, "step": 1315, "train/total_loss": 0.21572229266166687 }, { "entropy": 9.360983848571777, "epoch": 0.13011666996242832, "mean_token_accuracy": 0.7734375, "num_tokens": 7239171.0, "step": 1316, "train/ce_loss": 0.5523690581321716 }, { "epoch": 0.13011666996242832, "step": 1316, "train/sim_loss": 0.04296875 }, { "epoch": 0.13011666996242832, "step": 1316, "train/total_loss": 0.09820565581321716 }, { "entropy": 9.360960006713867, "epoch": 0.13021554281194384, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 7244548.0, "step": 1317, "train/ce_loss": 1.0498685836791992 }, { "epoch": 0.13021554281194384, "step": 1317, "train/sim_loss": 0.09375 }, { "epoch": 0.13021554281194384, "step": 1317, "train/total_loss": 0.19873686134815216 }, { "entropy": 9.130002975463867, "epoch": 0.13031441566145938, "mean_token_accuracy": 0.695035457611084, "num_tokens": 7250021.0, "step": 1318, "train/ce_loss": 0.7879402041435242 }, { "epoch": 0.13031441566145938, "step": 1318, "train/sim_loss": 0.0546875 }, { "epoch": 0.13031441566145938, "step": 1318, "train/total_loss": 0.13348153233528137 }, { "entropy": 9.387361526489258, "epoch": 0.1304132885109749, "mean_token_accuracy": 0.7731673717498779, "num_tokens": 7255349.0, "step": 1319, "train/ce_loss": 0.6303857564926147 }, { "epoch": 0.1304132885109749, "step": 1319, "train/sim_loss": 0.07421875 }, { "epoch": 0.1304132885109749, "step": 1319, "train/total_loss": 0.13725733757019043 }, { "epoch": 0.1305121613604904, "grad_norm": 0.9029608368873596, "learning_rate": 9.676358601592247e-06, "loss": 0.1536, "step": 1320 }, { "entropy": 9.514813423156738, "epoch": 0.1305121613604904, "mean_token_accuracy": 0.7083333134651184, "num_tokens": 7260688.0, "step": 1320, "train/ce_loss": 0.6481301784515381 }, { "epoch": 0.1305121613604904, "step": 1320, "train/sim_loss": 0.078125 }, { "epoch": 0.1305121613604904, "step": 1320, "train/total_loss": 0.1429380178451538 }, { "entropy": 8.990021705627441, "epoch": 0.13061103421000594, "mean_token_accuracy": 0.7502347230911255, "num_tokens": 7266259.0, "step": 1321, "train/ce_loss": 0.42992448806762695 }, { "epoch": 0.13061103421000594, "step": 1321, "train/sim_loss": 0.06640625 }, { "epoch": 0.13061103421000594, "step": 1321, "train/total_loss": 0.10939870029687881 }, { "entropy": 9.26710319519043, "epoch": 0.13070990705952146, "mean_token_accuracy": 0.815511167049408, "num_tokens": 7271628.0, "step": 1322, "train/ce_loss": 0.37106621265411377 }, { "epoch": 0.13070990705952146, "step": 1322, "train/sim_loss": 0.0625 }, { "epoch": 0.13070990705952146, "step": 1322, "train/total_loss": 0.09960661828517914 }, { "entropy": 9.372325897216797, "epoch": 0.13080877990903697, "mean_token_accuracy": 0.7644736766815186, "num_tokens": 7277005.0, "step": 1323, "train/ce_loss": 1.3307992219924927 }, { "epoch": 0.13080877990903697, "step": 1323, "train/sim_loss": 0.140625 }, { "epoch": 0.13080877990903697, "step": 1323, "train/total_loss": 0.2737049460411072 }, { "entropy": 9.55516242980957, "epoch": 0.1309076527585525, "mean_token_accuracy": 0.7575322985649109, "num_tokens": 7282447.0, "step": 1324, "train/ce_loss": 0.9125069379806519 }, { "epoch": 0.1309076527585525, "step": 1324, "train/sim_loss": 0.1171875 }, { "epoch": 0.1309076527585525, "step": 1324, "train/total_loss": 0.2084381878376007 }, { "entropy": 9.39569091796875, "epoch": 0.13100652560806803, "mean_token_accuracy": 0.7286931872367859, "num_tokens": 7287817.0, "step": 1325, "train/ce_loss": 0.8580958843231201 }, { "epoch": 0.13100652560806803, "step": 1325, "train/sim_loss": 0.0625 }, { "epoch": 0.13100652560806803, "step": 1325, "train/total_loss": 0.148309588432312 }, { "entropy": 8.887149810791016, "epoch": 0.13110539845758354, "mean_token_accuracy": 0.7761852145195007, "num_tokens": 7293390.0, "step": 1326, "train/ce_loss": 0.5178242921829224 }, { "epoch": 0.13110539845758354, "step": 1326, "train/sim_loss": 0.0625 }, { "epoch": 0.13110539845758354, "step": 1326, "train/total_loss": 0.11428242921829224 }, { "entropy": 9.228939056396484, "epoch": 0.13120427130709908, "mean_token_accuracy": 0.6818181872367859, "num_tokens": 7298916.0, "step": 1327, "train/ce_loss": 0.4574768543243408 }, { "epoch": 0.13120427130709908, "step": 1327, "train/sim_loss": 0.0234375 }, { "epoch": 0.13120427130709908, "step": 1327, "train/total_loss": 0.06918518245220184 }, { "entropy": 8.832939147949219, "epoch": 0.1313031441566146, "mean_token_accuracy": 0.6696944832801819, "num_tokens": 7304730.0, "step": 1328, "train/ce_loss": 1.5078606605529785 }, { "epoch": 0.1313031441566146, "step": 1328, "train/sim_loss": 0.0625 }, { "epoch": 0.1313031441566146, "step": 1328, "train/total_loss": 0.21328607201576233 }, { "entropy": 9.302419662475586, "epoch": 0.1314020170061301, "mean_token_accuracy": 0.7668711543083191, "num_tokens": 7310186.0, "step": 1329, "train/ce_loss": 0.733180046081543 }, { "epoch": 0.1314020170061301, "step": 1329, "train/sim_loss": 0.0625 }, { "epoch": 0.1314020170061301, "step": 1329, "train/total_loss": 0.1358180046081543 }, { "entropy": 9.401480674743652, "epoch": 0.13150088985564565, "mean_token_accuracy": 0.7292464971542358, "num_tokens": 7315602.0, "step": 1330, "train/ce_loss": 0.37205106019973755 }, { "epoch": 0.13150088985564565, "step": 1330, "train/sim_loss": 0.06640625 }, { "epoch": 0.13150088985564565, "step": 1330, "train/total_loss": 0.10361135751008987 }, { "entropy": 9.353321075439453, "epoch": 0.13159976270516116, "mean_token_accuracy": 0.8071428537368774, "num_tokens": 7320983.0, "step": 1331, "train/ce_loss": 0.6040170788764954 }, { "epoch": 0.13159976270516116, "step": 1331, "train/sim_loss": 0.1171875 }, { "epoch": 0.13159976270516116, "step": 1331, "train/total_loss": 0.17758920788764954 }, { "entropy": 9.031932830810547, "epoch": 0.13169863555467667, "mean_token_accuracy": 0.6900237798690796, "num_tokens": 7326527.0, "step": 1332, "train/ce_loss": 0.5008515119552612 }, { "epoch": 0.13169863555467667, "step": 1332, "train/sim_loss": 0.125 }, { "epoch": 0.13169863555467667, "step": 1332, "train/total_loss": 0.1750851571559906 }, { "entropy": 9.065460205078125, "epoch": 0.13179750840419222, "mean_token_accuracy": 0.7579162120819092, "num_tokens": 7332135.0, "step": 1333, "train/ce_loss": 1.127722978591919 }, { "epoch": 0.13179750840419222, "step": 1333, "train/sim_loss": 0.125 }, { "epoch": 0.13179750840419222, "step": 1333, "train/total_loss": 0.23777230083942413 }, { "entropy": 8.913217544555664, "epoch": 0.13189638125370773, "mean_token_accuracy": 0.8067542314529419, "num_tokens": 7337844.0, "step": 1334, "train/ce_loss": 0.5623464584350586 }, { "epoch": 0.13189638125370773, "step": 1334, "train/sim_loss": 0.0390625 }, { "epoch": 0.13189638125370773, "step": 1334, "train/total_loss": 0.09529714286327362 }, { "entropy": 9.18136978149414, "epoch": 0.13199525410322324, "mean_token_accuracy": 0.6800000071525574, "num_tokens": 7343372.0, "step": 1335, "train/ce_loss": 0.726325273513794 }, { "epoch": 0.13199525410322324, "step": 1335, "train/sim_loss": 0.09765625 }, { "epoch": 0.13199525410322324, "step": 1335, "train/total_loss": 0.17028877139091492 }, { "entropy": 8.81228256225586, "epoch": 0.13209412695273878, "mean_token_accuracy": 0.7835249304771423, "num_tokens": 7349023.0, "step": 1336, "train/ce_loss": 0.5300569534301758 }, { "epoch": 0.13209412695273878, "step": 1336, "train/sim_loss": 0.1328125 }, { "epoch": 0.13209412695273878, "step": 1336, "train/total_loss": 0.18581819534301758 }, { "entropy": 9.208993911743164, "epoch": 0.1321929998022543, "mean_token_accuracy": 0.7406483888626099, "num_tokens": 7354421.0, "step": 1337, "train/ce_loss": 0.7386722564697266 }, { "epoch": 0.1321929998022543, "step": 1337, "train/sim_loss": 0.1171875 }, { "epoch": 0.1321929998022543, "step": 1337, "train/total_loss": 0.19105473160743713 }, { "entropy": 9.371105194091797, "epoch": 0.13229187265176984, "mean_token_accuracy": 0.7376847267150879, "num_tokens": 7359828.0, "step": 1338, "train/ce_loss": 0.6182477474212646 }, { "epoch": 0.13229187265176984, "step": 1338, "train/sim_loss": 0.09375 }, { "epoch": 0.13229187265176984, "step": 1338, "train/total_loss": 0.155574768781662 }, { "entropy": 9.15401554107666, "epoch": 0.13239074550128535, "mean_token_accuracy": 0.7979910969734192, "num_tokens": 7365395.0, "step": 1339, "train/ce_loss": 0.5902302861213684 }, { "epoch": 0.13239074550128535, "step": 1339, "train/sim_loss": 0.05859375 }, { "epoch": 0.13239074550128535, "step": 1339, "train/total_loss": 0.11761678010225296 }, { "epoch": 0.13248961835080086, "grad_norm": 0.865107536315918, "learning_rate": 9.671413736834299e-06, "loss": 0.1614, "step": 1340 }, { "entropy": 9.356657028198242, "epoch": 0.13248961835080086, "mean_token_accuracy": 0.714631199836731, "num_tokens": 7370777.0, "step": 1340, "train/ce_loss": 1.1644408702850342 }, { "epoch": 0.13248961835080086, "step": 1340, "train/sim_loss": 0.046875 }, { "epoch": 0.13248961835080086, "step": 1340, "train/total_loss": 0.16331908106803894 }, { "entropy": 9.333028793334961, "epoch": 0.1325884912003164, "mean_token_accuracy": 0.7444831728935242, "num_tokens": 7376124.0, "step": 1341, "train/ce_loss": 1.1876604557037354 }, { "epoch": 0.1325884912003164, "step": 1341, "train/sim_loss": 0.09375 }, { "epoch": 0.1325884912003164, "step": 1341, "train/total_loss": 0.21251603960990906 }, { "entropy": 9.080362319946289, "epoch": 0.13268736404983192, "mean_token_accuracy": 0.70659339427948, "num_tokens": 7381686.0, "step": 1342, "train/ce_loss": 1.082200527191162 }, { "epoch": 0.13268736404983192, "step": 1342, "train/sim_loss": 0.078125 }, { "epoch": 0.13268736404983192, "step": 1342, "train/total_loss": 0.18634505569934845 }, { "entropy": 9.276814460754395, "epoch": 0.13278623689934743, "mean_token_accuracy": 0.726681113243103, "num_tokens": 7387169.0, "step": 1343, "train/ce_loss": 0.45044034719467163 }, { "epoch": 0.13278623689934743, "step": 1343, "train/sim_loss": 0.14453125 }, { "epoch": 0.13278623689934743, "step": 1343, "train/total_loss": 0.18957528471946716 }, { "entropy": 9.112545013427734, "epoch": 0.13288510974886297, "mean_token_accuracy": 0.7462121248245239, "num_tokens": 7392780.0, "step": 1344, "train/ce_loss": 1.5367382764816284 }, { "epoch": 0.13288510974886297, "step": 1344, "train/sim_loss": 0.0625 }, { "epoch": 0.13288510974886297, "step": 1344, "train/total_loss": 0.21617382764816284 }, { "entropy": 9.251157760620117, "epoch": 0.1329839825983785, "mean_token_accuracy": 0.7614781856536865, "num_tokens": 7398325.0, "step": 1345, "train/ce_loss": 0.7116100192070007 }, { "epoch": 0.1329839825983785, "step": 1345, "train/sim_loss": 0.02734375 }, { "epoch": 0.1329839825983785, "step": 1345, "train/total_loss": 0.09850475192070007 }, { "entropy": 9.623586654663086, "epoch": 0.133082855447894, "mean_token_accuracy": 0.7767584323883057, "num_tokens": 7403596.0, "step": 1346, "train/ce_loss": 0.8847566843032837 }, { "epoch": 0.133082855447894, "step": 1346, "train/sim_loss": 0.1015625 }, { "epoch": 0.133082855447894, "step": 1346, "train/total_loss": 0.19003817439079285 }, { "entropy": 9.650965690612793, "epoch": 0.13318172829740954, "mean_token_accuracy": 0.7125382423400879, "num_tokens": 7408848.0, "step": 1347, "train/ce_loss": 0.7470201849937439 }, { "epoch": 0.13318172829740954, "step": 1347, "train/sim_loss": 0.0703125 }, { "epoch": 0.13318172829740954, "step": 1347, "train/total_loss": 0.14501452445983887 }, { "entropy": 9.417610168457031, "epoch": 0.13328060114692505, "mean_token_accuracy": 0.7610965967178345, "num_tokens": 7414254.0, "step": 1348, "train/ce_loss": 0.7492644190788269 }, { "epoch": 0.13328060114692505, "step": 1348, "train/sim_loss": 0.0859375 }, { "epoch": 0.13328060114692505, "step": 1348, "train/total_loss": 0.1608639359474182 }, { "entropy": 9.690401077270508, "epoch": 0.13337947399644057, "mean_token_accuracy": 0.7008000016212463, "num_tokens": 7419443.0, "step": 1349, "train/ce_loss": 0.8859387636184692 }, { "epoch": 0.13337947399644057, "step": 1349, "train/sim_loss": 0.05078125 }, { "epoch": 0.13337947399644057, "step": 1349, "train/total_loss": 0.13937512040138245 }, { "entropy": 9.333337783813477, "epoch": 0.1334783468459561, "mean_token_accuracy": 0.7512626051902771, "num_tokens": 7424841.0, "step": 1350, "train/ce_loss": 1.094757080078125 }, { "epoch": 0.1334783468459561, "step": 1350, "train/sim_loss": 0.12109375 }, { "epoch": 0.1334783468459561, "step": 1350, "train/total_loss": 0.23056945204734802 }, { "entropy": 9.180792808532715, "epoch": 0.13357721969547162, "mean_token_accuracy": 0.7876344323158264, "num_tokens": 7430244.0, "step": 1351, "train/ce_loss": 0.9078308343887329 }, { "epoch": 0.13357721969547162, "step": 1351, "train/sim_loss": 0.09375 }, { "epoch": 0.13357721969547162, "step": 1351, "train/total_loss": 0.18453308939933777 }, { "entropy": 9.242652893066406, "epoch": 0.13367609254498714, "mean_token_accuracy": 0.7193763852119446, "num_tokens": 7435718.0, "step": 1352, "train/ce_loss": 1.2068300247192383 }, { "epoch": 0.13367609254498714, "step": 1352, "train/sim_loss": 0.09375 }, { "epoch": 0.13367609254498714, "step": 1352, "train/total_loss": 0.21443301439285278 }, { "entropy": 9.617881774902344, "epoch": 0.13377496539450268, "mean_token_accuracy": 0.7196531891822815, "num_tokens": 7440998.0, "step": 1353, "train/ce_loss": 1.1029118299484253 }, { "epoch": 0.13377496539450268, "step": 1353, "train/sim_loss": 0.125 }, { "epoch": 0.13377496539450268, "step": 1353, "train/total_loss": 0.23529118299484253 }, { "entropy": 9.207983016967773, "epoch": 0.1338738382440182, "mean_token_accuracy": 0.6746987700462341, "num_tokens": 7446581.0, "step": 1354, "train/ce_loss": 1.6574394702911377 }, { "epoch": 0.1338738382440182, "step": 1354, "train/sim_loss": 0.05078125 }, { "epoch": 0.1338738382440182, "step": 1354, "train/total_loss": 0.21652519702911377 }, { "entropy": 9.364320755004883, "epoch": 0.1339727110935337, "mean_token_accuracy": 0.758323073387146, "num_tokens": 7452056.0, "step": 1355, "train/ce_loss": 0.5007727146148682 }, { "epoch": 0.1339727110935337, "step": 1355, "train/sim_loss": 0.12109375 }, { "epoch": 0.1339727110935337, "step": 1355, "train/total_loss": 0.17117102444171906 }, { "entropy": 8.796595573425293, "epoch": 0.13407158394304924, "mean_token_accuracy": 0.7227272987365723, "num_tokens": 7458009.0, "step": 1356, "train/ce_loss": 1.0730032920837402 }, { "epoch": 0.13407158394304924, "step": 1356, "train/sim_loss": 0.13671875 }, { "epoch": 0.13407158394304924, "step": 1356, "train/total_loss": 0.24401909112930298 }, { "entropy": 9.506914138793945, "epoch": 0.13417045679256476, "mean_token_accuracy": 0.7208994626998901, "num_tokens": 7463351.0, "step": 1357, "train/ce_loss": 0.9925903081893921 }, { "epoch": 0.13417045679256476, "step": 1357, "train/sim_loss": 0.07421875 }, { "epoch": 0.13417045679256476, "step": 1357, "train/total_loss": 0.17347778379917145 }, { "entropy": 9.468799591064453, "epoch": 0.1342693296420803, "mean_token_accuracy": 0.709876537322998, "num_tokens": 7468774.0, "step": 1358, "train/ce_loss": 0.6264256238937378 }, { "epoch": 0.1342693296420803, "step": 1358, "train/sim_loss": 0.1015625 }, { "epoch": 0.1342693296420803, "step": 1358, "train/total_loss": 0.16420507431030273 }, { "entropy": 9.243589401245117, "epoch": 0.1343682024915958, "mean_token_accuracy": 0.7047738432884216, "num_tokens": 7474164.0, "step": 1359, "train/ce_loss": 0.8132681250572205 }, { "epoch": 0.1343682024915958, "step": 1359, "train/sim_loss": 0.0859375 }, { "epoch": 0.1343682024915958, "step": 1359, "train/total_loss": 0.16726431250572205 }, { "epoch": 0.13446707534111133, "grad_norm": 1.1508452892303467, "learning_rate": 9.66646887207635e-06, "loss": 0.1675, "step": 1360 }, { "entropy": 9.295428276062012, "epoch": 0.13446707534111133, "mean_token_accuracy": 0.7433217167854309, "num_tokens": 7479626.0, "step": 1360, "train/ce_loss": 0.353597491979599 }, { "epoch": 0.13446707534111133, "step": 1360, "train/sim_loss": 0.06640625 }, { "epoch": 0.13446707534111133, "step": 1360, "train/total_loss": 0.10176600515842438 }, { "entropy": 9.055641174316406, "epoch": 0.13456594819062687, "mean_token_accuracy": 0.7641083598136902, "num_tokens": 7485217.0, "step": 1361, "train/ce_loss": 1.073459267616272 }, { "epoch": 0.13456594819062687, "step": 1361, "train/sim_loss": 0.06640625 }, { "epoch": 0.13456594819062687, "step": 1361, "train/total_loss": 0.17375218868255615 }, { "entropy": 8.795032501220703, "epoch": 0.13466482104014238, "mean_token_accuracy": 0.7438330054283142, "num_tokens": 7490867.0, "step": 1362, "train/ce_loss": 1.178248405456543 }, { "epoch": 0.13466482104014238, "step": 1362, "train/sim_loss": 0.1015625 }, { "epoch": 0.13466482104014238, "step": 1362, "train/total_loss": 0.21938735246658325 }, { "entropy": 9.72520637512207, "epoch": 0.1347636938896579, "mean_token_accuracy": 0.777265727519989, "num_tokens": 7496033.0, "step": 1363, "train/ce_loss": 0.4659506380558014 }, { "epoch": 0.1347636938896579, "step": 1363, "train/sim_loss": 0.03125 }, { "epoch": 0.1347636938896579, "step": 1363, "train/total_loss": 0.07784506678581238 }, { "entropy": 9.370145797729492, "epoch": 0.13486256673917343, "mean_token_accuracy": 0.7825567722320557, "num_tokens": 7501510.0, "step": 1364, "train/ce_loss": 0.5745517015457153 }, { "epoch": 0.13486256673917343, "step": 1364, "train/sim_loss": 0.08203125 }, { "epoch": 0.13486256673917343, "step": 1364, "train/total_loss": 0.1394864171743393 }, { "entropy": 9.465887069702148, "epoch": 0.13496143958868895, "mean_token_accuracy": 0.7103538513183594, "num_tokens": 7506905.0, "step": 1365, "train/ce_loss": 0.7180687189102173 }, { "epoch": 0.13496143958868895, "step": 1365, "train/sim_loss": 0.12890625 }, { "epoch": 0.13496143958868895, "step": 1365, "train/total_loss": 0.2007131278514862 }, { "entropy": 9.046463012695312, "epoch": 0.13506031243820446, "mean_token_accuracy": 0.7139874696731567, "num_tokens": 7512631.0, "step": 1366, "train/ce_loss": 1.2173186540603638 }, { "epoch": 0.13506031243820446, "step": 1366, "train/sim_loss": 0.15625 }, { "epoch": 0.13506031243820446, "step": 1366, "train/total_loss": 0.27798187732696533 }, { "entropy": 9.740477561950684, "epoch": 0.13515918528772, "mean_token_accuracy": 0.7755101919174194, "num_tokens": 7517874.0, "step": 1367, "train/ce_loss": 0.4832552373409271 }, { "epoch": 0.13515918528772, "step": 1367, "train/sim_loss": 0.02734375 }, { "epoch": 0.13515918528772, "step": 1367, "train/total_loss": 0.07566927373409271 }, { "entropy": 9.268457412719727, "epoch": 0.13525805813723552, "mean_token_accuracy": 0.7414805889129639, "num_tokens": 7523331.0, "step": 1368, "train/ce_loss": 0.8880173563957214 }, { "epoch": 0.13525805813723552, "step": 1368, "train/sim_loss": 0.08984375 }, { "epoch": 0.13525805813723552, "step": 1368, "train/total_loss": 0.17864549160003662 }, { "entropy": 9.221433639526367, "epoch": 0.13535693098675103, "mean_token_accuracy": 0.7874186635017395, "num_tokens": 7528829.0, "step": 1369, "train/ce_loss": 0.46327099204063416 }, { "epoch": 0.13535693098675103, "step": 1369, "train/sim_loss": 0.0859375 }, { "epoch": 0.13535693098675103, "step": 1369, "train/total_loss": 0.13226459920406342 }, { "entropy": 9.39745044708252, "epoch": 0.13545580383626657, "mean_token_accuracy": 0.7734103798866272, "num_tokens": 7534476.0, "step": 1370, "train/ce_loss": 0.5287767052650452 }, { "epoch": 0.13545580383626657, "step": 1370, "train/sim_loss": 0.03125 }, { "epoch": 0.13545580383626657, "step": 1370, "train/total_loss": 0.08412767201662064 }, { "entropy": 9.383447647094727, "epoch": 0.13555467668578208, "mean_token_accuracy": 0.6947115659713745, "num_tokens": 7539953.0, "step": 1371, "train/ce_loss": 0.7998374104499817 }, { "epoch": 0.13555467668578208, "step": 1371, "train/sim_loss": 0.11328125 }, { "epoch": 0.13555467668578208, "step": 1371, "train/total_loss": 0.19326499104499817 }, { "entropy": 9.263409614562988, "epoch": 0.1356535495352976, "mean_token_accuracy": 0.7381864786148071, "num_tokens": 7545292.0, "step": 1372, "train/ce_loss": 0.31506991386413574 }, { "epoch": 0.1356535495352976, "step": 1372, "train/sim_loss": 0.09765625 }, { "epoch": 0.1356535495352976, "step": 1372, "train/total_loss": 0.1291632354259491 }, { "entropy": 8.942586898803711, "epoch": 0.13575242238481314, "mean_token_accuracy": 0.7604630589485168, "num_tokens": 7550981.0, "step": 1373, "train/ce_loss": 0.6923180222511292 }, { "epoch": 0.13575242238481314, "step": 1373, "train/sim_loss": 0.06640625 }, { "epoch": 0.13575242238481314, "step": 1373, "train/total_loss": 0.1356380581855774 }, { "entropy": 9.435749053955078, "epoch": 0.13585129523432865, "mean_token_accuracy": 0.7278401851654053, "num_tokens": 7556386.0, "step": 1374, "train/ce_loss": 0.8412852883338928 }, { "epoch": 0.13585129523432865, "step": 1374, "train/sim_loss": 0.08203125 }, { "epoch": 0.13585129523432865, "step": 1374, "train/total_loss": 0.16615977883338928 }, { "entropy": 9.0076322555542, "epoch": 0.13595016808384416, "mean_token_accuracy": 0.7039711475372314, "num_tokens": 7562093.0, "step": 1375, "train/ce_loss": 1.3659143447875977 }, { "epoch": 0.13595016808384416, "step": 1375, "train/sim_loss": 0.15625 }, { "epoch": 0.13595016808384416, "step": 1375, "train/total_loss": 0.29284143447875977 }, { "entropy": 9.064604759216309, "epoch": 0.1360490409333597, "mean_token_accuracy": 0.715976357460022, "num_tokens": 7567654.0, "step": 1376, "train/ce_loss": 1.3176347017288208 }, { "epoch": 0.1360490409333597, "step": 1376, "train/sim_loss": 0.0859375 }, { "epoch": 0.1360490409333597, "step": 1376, "train/total_loss": 0.21770097315311432 }, { "entropy": 9.256625175476074, "epoch": 0.13614791378287522, "mean_token_accuracy": 0.6576470732688904, "num_tokens": 7573138.0, "step": 1377, "train/ce_loss": 0.8395988345146179 }, { "epoch": 0.13614791378287522, "step": 1377, "train/sim_loss": 0.09375 }, { "epoch": 0.13614791378287522, "step": 1377, "train/total_loss": 0.17770987749099731 }, { "entropy": 9.103404998779297, "epoch": 0.13624678663239073, "mean_token_accuracy": 0.7629547715187073, "num_tokens": 7578605.0, "step": 1378, "train/ce_loss": 0.6307790279388428 }, { "epoch": 0.13624678663239073, "step": 1378, "train/sim_loss": 0.02734375 }, { "epoch": 0.13624678663239073, "step": 1378, "train/total_loss": 0.0904216542840004 }, { "entropy": 9.181008338928223, "epoch": 0.13634565948190627, "mean_token_accuracy": 0.7228260636329651, "num_tokens": 7584131.0, "step": 1379, "train/ce_loss": 0.7328585386276245 }, { "epoch": 0.13634565948190627, "step": 1379, "train/sim_loss": 0.06640625 }, { "epoch": 0.13634565948190627, "step": 1379, "train/total_loss": 0.13969209790229797 }, { "epoch": 0.1364445323314218, "grad_norm": 1.0652365684509277, "learning_rate": 9.6615240073184e-06, "loss": 0.1656, "step": 1380 }, { "entropy": 9.197894096374512, "epoch": 0.1364445323314218, "mean_token_accuracy": 0.7525539398193359, "num_tokens": 7589642.0, "step": 1380, "train/ce_loss": 0.5968760251998901 }, { "epoch": 0.1364445323314218, "step": 1380, "train/sim_loss": 0.0390625 }, { "epoch": 0.1364445323314218, "step": 1380, "train/total_loss": 0.09875009953975677 }, { "entropy": 9.21841049194336, "epoch": 0.13654340518093733, "mean_token_accuracy": 0.7220512628555298, "num_tokens": 7595225.0, "step": 1381, "train/ce_loss": 0.7903860807418823 }, { "epoch": 0.13654340518093733, "step": 1381, "train/sim_loss": 0.0390625 }, { "epoch": 0.13654340518093733, "step": 1381, "train/total_loss": 0.11810111254453659 }, { "entropy": 9.421442031860352, "epoch": 0.13664227803045284, "mean_token_accuracy": 0.7539203763008118, "num_tokens": 7600624.0, "step": 1382, "train/ce_loss": 0.6076535582542419 }, { "epoch": 0.13664227803045284, "step": 1382, "train/sim_loss": 0.11328125 }, { "epoch": 0.13664227803045284, "step": 1382, "train/total_loss": 0.1740466058254242 }, { "entropy": 8.836116790771484, "epoch": 0.13674115087996835, "mean_token_accuracy": 0.6913896203041077, "num_tokens": 7606581.0, "step": 1383, "train/ce_loss": 0.5354176163673401 }, { "epoch": 0.13674115087996835, "step": 1383, "train/sim_loss": 0.0859375 }, { "epoch": 0.13674115087996835, "step": 1383, "train/total_loss": 0.13947926461696625 }, { "entropy": 9.139223098754883, "epoch": 0.1368400237294839, "mean_token_accuracy": 0.7829787135124207, "num_tokens": 7612190.0, "step": 1384, "train/ce_loss": 1.1583524942398071 }, { "epoch": 0.1368400237294839, "step": 1384, "train/sim_loss": 0.0859375 }, { "epoch": 0.1368400237294839, "step": 1384, "train/total_loss": 0.2017727494239807 }, { "entropy": 9.34244441986084, "epoch": 0.1369388965789994, "mean_token_accuracy": 0.767106831073761, "num_tokens": 7617627.0, "step": 1385, "train/ce_loss": 0.6480108499526978 }, { "epoch": 0.1369388965789994, "step": 1385, "train/sim_loss": 0.0546875 }, { "epoch": 0.1369388965789994, "step": 1385, "train/total_loss": 0.11948858946561813 }, { "entropy": 9.526705741882324, "epoch": 0.13703776942851492, "mean_token_accuracy": 0.7700534462928772, "num_tokens": 7622983.0, "step": 1386, "train/ce_loss": 0.5881717205047607 }, { "epoch": 0.13703776942851492, "step": 1386, "train/sim_loss": 0.08203125 }, { "epoch": 0.13703776942851492, "step": 1386, "train/total_loss": 0.14084842801094055 }, { "entropy": 9.209314346313477, "epoch": 0.13713664227803046, "mean_token_accuracy": 0.7077720165252686, "num_tokens": 7628434.0, "step": 1387, "train/ce_loss": 0.7670530080795288 }, { "epoch": 0.13713664227803046, "step": 1387, "train/sim_loss": 0.09765625 }, { "epoch": 0.13713664227803046, "step": 1387, "train/total_loss": 0.17436155676841736 }, { "entropy": 9.06583309173584, "epoch": 0.13723551512754598, "mean_token_accuracy": 0.7211538553237915, "num_tokens": 7634017.0, "step": 1388, "train/ce_loss": 1.0072404146194458 }, { "epoch": 0.13723551512754598, "step": 1388, "train/sim_loss": 0.1953125 }, { "epoch": 0.13723551512754598, "step": 1388, "train/total_loss": 0.2960365414619446 }, { "entropy": 9.323302268981934, "epoch": 0.1373343879770615, "mean_token_accuracy": 0.7563218474388123, "num_tokens": 7639471.0, "step": 1389, "train/ce_loss": 0.645159125328064 }, { "epoch": 0.1373343879770615, "step": 1389, "train/sim_loss": 0.0625 }, { "epoch": 0.1373343879770615, "step": 1389, "train/total_loss": 0.12701591849327087 }, { "entropy": 9.372289657592773, "epoch": 0.13743326082657703, "mean_token_accuracy": 0.7139334082603455, "num_tokens": 7644919.0, "step": 1390, "train/ce_loss": 1.3982884883880615 }, { "epoch": 0.13743326082657703, "step": 1390, "train/sim_loss": 0.1328125 }, { "epoch": 0.13743326082657703, "step": 1390, "train/total_loss": 0.2726413607597351 }, { "entropy": 9.312299728393555, "epoch": 0.13753213367609254, "mean_token_accuracy": 0.6748251914978027, "num_tokens": 7650462.0, "step": 1391, "train/ce_loss": 1.158392310142517 }, { "epoch": 0.13753213367609254, "step": 1391, "train/sim_loss": 0.13671875 }, { "epoch": 0.13753213367609254, "step": 1391, "train/total_loss": 0.25255799293518066 }, { "entropy": 9.139798164367676, "epoch": 0.13763100652560806, "mean_token_accuracy": 0.7587268948554993, "num_tokens": 7656115.0, "step": 1392, "train/ce_loss": 0.49836188554763794 }, { "epoch": 0.13763100652560806, "step": 1392, "train/sim_loss": 0.08984375 }, { "epoch": 0.13763100652560806, "step": 1392, "train/total_loss": 0.1396799385547638 }, { "entropy": 9.221004486083984, "epoch": 0.1377298793751236, "mean_token_accuracy": 0.7438105344772339, "num_tokens": 7661608.0, "step": 1393, "train/ce_loss": 1.0505791902542114 }, { "epoch": 0.1377298793751236, "step": 1393, "train/sim_loss": 0.1171875 }, { "epoch": 0.1377298793751236, "step": 1393, "train/total_loss": 0.22224542498588562 }, { "entropy": 8.834049224853516, "epoch": 0.1378287522246391, "mean_token_accuracy": 0.6942059993743896, "num_tokens": 7667211.0, "step": 1394, "train/ce_loss": 1.0289504528045654 }, { "epoch": 0.1378287522246391, "step": 1394, "train/sim_loss": 0.08203125 }, { "epoch": 0.1378287522246391, "step": 1394, "train/total_loss": 0.18492630124092102 }, { "entropy": 9.098838806152344, "epoch": 0.13792762507415463, "mean_token_accuracy": 0.7338129281997681, "num_tokens": 7672815.0, "step": 1395, "train/ce_loss": 0.5848605036735535 }, { "epoch": 0.13792762507415463, "step": 1395, "train/sim_loss": 0.0390625 }, { "epoch": 0.13792762507415463, "step": 1395, "train/total_loss": 0.09754855185747147 }, { "entropy": 9.302263259887695, "epoch": 0.13802649792367017, "mean_token_accuracy": 0.753333330154419, "num_tokens": 7678178.0, "step": 1396, "train/ce_loss": 1.0331591367721558 }, { "epoch": 0.13802649792367017, "step": 1396, "train/sim_loss": 0.09375 }, { "epoch": 0.13802649792367017, "step": 1396, "train/total_loss": 0.19706591963768005 }, { "entropy": 9.258892059326172, "epoch": 0.13812537077318568, "mean_token_accuracy": 0.7100939154624939, "num_tokens": 7683772.0, "step": 1397, "train/ce_loss": 0.6100711226463318 }, { "epoch": 0.13812537077318568, "step": 1397, "train/sim_loss": 0.1171875 }, { "epoch": 0.13812537077318568, "step": 1397, "train/total_loss": 0.17819461226463318 }, { "entropy": 9.02194881439209, "epoch": 0.1382242436227012, "mean_token_accuracy": 0.7407024502754211, "num_tokens": 7689390.0, "step": 1398, "train/ce_loss": 0.6515518426895142 }, { "epoch": 0.1382242436227012, "step": 1398, "train/sim_loss": 0.109375 }, { "epoch": 0.1382242436227012, "step": 1398, "train/total_loss": 0.17453017830848694 }, { "entropy": 9.026273727416992, "epoch": 0.13832311647221673, "mean_token_accuracy": 0.736289381980896, "num_tokens": 7694848.0, "step": 1399, "train/ce_loss": 1.1290706396102905 }, { "epoch": 0.13832311647221673, "step": 1399, "train/sim_loss": 0.078125 }, { "epoch": 0.13832311647221673, "step": 1399, "train/total_loss": 0.1910320669412613 }, { "epoch": 0.13842198932173225, "grad_norm": 0.8544314503669739, "learning_rate": 9.656579142560452e-06, "loss": 0.1669, "step": 1400 }, { "entropy": 9.41135025024414, "epoch": 0.13842198932173225, "mean_token_accuracy": 0.7695418000221252, "num_tokens": 7700245.0, "step": 1400, "train/ce_loss": 0.8614939451217651 }, { "epoch": 0.13842198932173225, "step": 1400, "train/sim_loss": 0.0859375 }, { "epoch": 0.13842198932173225, "step": 1400, "train/total_loss": 0.1720868945121765 }, { "entropy": 8.775728225708008, "epoch": 0.1385208621712478, "mean_token_accuracy": 0.7021439671516418, "num_tokens": 7706048.0, "step": 1401, "train/ce_loss": 1.3034549951553345 }, { "epoch": 0.1385208621712478, "step": 1401, "train/sim_loss": 0.0625 }, { "epoch": 0.1385208621712478, "step": 1401, "train/total_loss": 0.19284550845623016 }, { "entropy": 9.531765937805176, "epoch": 0.1386197350207633, "mean_token_accuracy": 0.7052932977676392, "num_tokens": 7711329.0, "step": 1402, "train/ce_loss": 0.8024470806121826 }, { "epoch": 0.1386197350207633, "step": 1402, "train/sim_loss": 0.12109375 }, { "epoch": 0.1386197350207633, "step": 1402, "train/total_loss": 0.20133846998214722 }, { "entropy": 9.188678741455078, "epoch": 0.13871860787027882, "mean_token_accuracy": 0.6964679956436157, "num_tokens": 7716876.0, "step": 1403, "train/ce_loss": 0.3704219460487366 }, { "epoch": 0.13871860787027882, "step": 1403, "train/sim_loss": 0.04296875 }, { "epoch": 0.13871860787027882, "step": 1403, "train/total_loss": 0.08001095056533813 }, { "entropy": 9.382659912109375, "epoch": 0.13881748071979436, "mean_token_accuracy": 0.6678921580314636, "num_tokens": 7722535.0, "step": 1404, "train/ce_loss": 1.2192801237106323 }, { "epoch": 0.13881748071979436, "step": 1404, "train/sim_loss": 0.05859375 }, { "epoch": 0.13881748071979436, "step": 1404, "train/total_loss": 0.18052175641059875 }, { "entropy": 9.03492546081543, "epoch": 0.13891635356930987, "mean_token_accuracy": 0.8057079911231995, "num_tokens": 7728077.0, "step": 1405, "train/ce_loss": 0.5457018613815308 }, { "epoch": 0.13891635356930987, "step": 1405, "train/sim_loss": 0.078125 }, { "epoch": 0.13891635356930987, "step": 1405, "train/total_loss": 0.13269518315792084 }, { "entropy": 9.08015251159668, "epoch": 0.13901522641882538, "mean_token_accuracy": 0.7750533223152161, "num_tokens": 7733551.0, "step": 1406, "train/ce_loss": 0.7104584574699402 }, { "epoch": 0.13901522641882538, "step": 1406, "train/sim_loss": 0.09375 }, { "epoch": 0.13901522641882538, "step": 1406, "train/total_loss": 0.16479584574699402 }, { "entropy": 9.286920547485352, "epoch": 0.13911409926834092, "mean_token_accuracy": 0.7522464990615845, "num_tokens": 7738928.0, "step": 1407, "train/ce_loss": 0.5630143880844116 }, { "epoch": 0.13911409926834092, "step": 1407, "train/sim_loss": 0.03125 }, { "epoch": 0.13911409926834092, "step": 1407, "train/total_loss": 0.08755144476890564 }, { "entropy": 9.539969444274902, "epoch": 0.13921297211785644, "mean_token_accuracy": 0.7270155549049377, "num_tokens": 7744194.0, "step": 1408, "train/ce_loss": 1.1761554479599 }, { "epoch": 0.13921297211785644, "step": 1408, "train/sim_loss": 0.12109375 }, { "epoch": 0.13921297211785644, "step": 1408, "train/total_loss": 0.23870930075645447 }, { "entropy": 9.258171081542969, "epoch": 0.13931184496737195, "mean_token_accuracy": 0.6738350987434387, "num_tokens": 7749649.0, "step": 1409, "train/ce_loss": 0.7204530239105225 }, { "epoch": 0.13931184496737195, "step": 1409, "train/sim_loss": 0.07421875 }, { "epoch": 0.13931184496737195, "step": 1409, "train/total_loss": 0.14626404643058777 }, { "entropy": 8.982789993286133, "epoch": 0.1394107178168875, "mean_token_accuracy": 0.7942088842391968, "num_tokens": 7755256.0, "step": 1410, "train/ce_loss": 0.5464046597480774 }, { "epoch": 0.1394107178168875, "step": 1410, "train/sim_loss": 0.03125 }, { "epoch": 0.1394107178168875, "step": 1410, "train/total_loss": 0.08589047193527222 }, { "entropy": 8.940760612487793, "epoch": 0.139509590666403, "mean_token_accuracy": 0.7704545259475708, "num_tokens": 7760768.0, "step": 1411, "train/ce_loss": 0.6209253668785095 }, { "epoch": 0.139509590666403, "step": 1411, "train/sim_loss": 0.01953125 }, { "epoch": 0.139509590666403, "step": 1411, "train/total_loss": 0.08162379264831543 }, { "entropy": 9.295748710632324, "epoch": 0.13960846351591852, "mean_token_accuracy": 0.6844207644462585, "num_tokens": 7766081.0, "step": 1412, "train/ce_loss": 1.3952964544296265 }, { "epoch": 0.13960846351591852, "step": 1412, "train/sim_loss": 0.08984375 }, { "epoch": 0.13960846351591852, "step": 1412, "train/total_loss": 0.22937339544296265 }, { "entropy": 9.228604316711426, "epoch": 0.13970733636543406, "mean_token_accuracy": 0.744429886341095, "num_tokens": 7771490.0, "step": 1413, "train/ce_loss": 0.8458226323127747 }, { "epoch": 0.13970733636543406, "step": 1413, "train/sim_loss": 0.07421875 }, { "epoch": 0.13970733636543406, "step": 1413, "train/total_loss": 0.15880101919174194 }, { "entropy": 9.014253616333008, "epoch": 0.13980620921494957, "mean_token_accuracy": 0.7424400448799133, "num_tokens": 7777076.0, "step": 1414, "train/ce_loss": 1.1695232391357422 }, { "epoch": 0.13980620921494957, "step": 1414, "train/sim_loss": 0.08203125 }, { "epoch": 0.13980620921494957, "step": 1414, "train/total_loss": 0.1989835798740387 }, { "entropy": 8.982006072998047, "epoch": 0.13990508206446509, "mean_token_accuracy": 0.7450157403945923, "num_tokens": 7782684.0, "step": 1415, "train/ce_loss": 0.7133921980857849 }, { "epoch": 0.13990508206446509, "step": 1415, "train/sim_loss": 0.04296875 }, { "epoch": 0.13990508206446509, "step": 1415, "train/total_loss": 0.11430796980857849 }, { "entropy": 8.99738883972168, "epoch": 0.14000395491398063, "mean_token_accuracy": 0.708053708076477, "num_tokens": 7788249.0, "step": 1416, "train/ce_loss": 1.021850347518921 }, { "epoch": 0.14000395491398063, "step": 1416, "train/sim_loss": 0.13671875 }, { "epoch": 0.14000395491398063, "step": 1416, "train/total_loss": 0.23890379071235657 }, { "entropy": 9.044642448425293, "epoch": 0.14010282776349614, "mean_token_accuracy": 0.7830396294593811, "num_tokens": 7793836.0, "step": 1417, "train/ce_loss": 0.6514425873756409 }, { "epoch": 0.14010282776349614, "step": 1417, "train/sim_loss": 0.0390625 }, { "epoch": 0.14010282776349614, "step": 1417, "train/total_loss": 0.10420676320791245 }, { "entropy": 9.367036819458008, "epoch": 0.14020170061301165, "mean_token_accuracy": 0.8035961389541626, "num_tokens": 7799195.0, "step": 1418, "train/ce_loss": 0.7320546507835388 }, { "epoch": 0.14020170061301165, "step": 1418, "train/sim_loss": 0.05078125 }, { "epoch": 0.14020170061301165, "step": 1418, "train/total_loss": 0.12398671358823776 }, { "entropy": 9.123520851135254, "epoch": 0.1403005734625272, "mean_token_accuracy": 0.7703016400337219, "num_tokens": 7804702.0, "step": 1419, "train/ce_loss": 0.7747244834899902 }, { "epoch": 0.1403005734625272, "step": 1419, "train/sim_loss": 0.078125 }, { "epoch": 0.1403005734625272, "step": 1419, "train/total_loss": 0.15559744834899902 }, { "epoch": 0.1403994463120427, "grad_norm": 0.8425388336181641, "learning_rate": 9.651634277802503e-06, "loss": 0.1628, "step": 1420 }, { "entropy": 9.08584976196289, "epoch": 0.1403994463120427, "mean_token_accuracy": 0.7810781002044678, "num_tokens": 7810182.0, "step": 1420, "train/ce_loss": 0.7497536540031433 }, { "epoch": 0.1403994463120427, "step": 1420, "train/sim_loss": 0.0625 }, { "epoch": 0.1403994463120427, "step": 1420, "train/total_loss": 0.1374753713607788 }, { "entropy": 9.162875175476074, "epoch": 0.14049831916155825, "mean_token_accuracy": 0.7530589699745178, "num_tokens": 7815683.0, "step": 1421, "train/ce_loss": 1.1598056554794312 }, { "epoch": 0.14049831916155825, "step": 1421, "train/sim_loss": 0.0859375 }, { "epoch": 0.14049831916155825, "step": 1421, "train/total_loss": 0.20191806554794312 }, { "entropy": 9.262896537780762, "epoch": 0.14059719201107376, "mean_token_accuracy": 0.772675096988678, "num_tokens": 7821132.0, "step": 1422, "train/ce_loss": 0.5230606198310852 }, { "epoch": 0.14059719201107376, "step": 1422, "train/sim_loss": 0.0625 }, { "epoch": 0.14059719201107376, "step": 1422, "train/total_loss": 0.11480606347322464 }, { "entropy": 8.908860206604004, "epoch": 0.14069606486058928, "mean_token_accuracy": 0.7246804237365723, "num_tokens": 7826773.0, "step": 1423, "train/ce_loss": 0.8518029451370239 }, { "epoch": 0.14069606486058928, "step": 1423, "train/sim_loss": 0.0625 }, { "epoch": 0.14069606486058928, "step": 1423, "train/total_loss": 0.14768029749393463 }, { "entropy": 9.264589309692383, "epoch": 0.14079493771010482, "mean_token_accuracy": 0.7203107476234436, "num_tokens": 7832319.0, "step": 1424, "train/ce_loss": 1.053589105606079 }, { "epoch": 0.14079493771010482, "step": 1424, "train/sim_loss": 0.08984375 }, { "epoch": 0.14079493771010482, "step": 1424, "train/total_loss": 0.19520266354084015 }, { "entropy": 9.2810640335083, "epoch": 0.14089381055962033, "mean_token_accuracy": 0.747586190700531, "num_tokens": 7837723.0, "step": 1425, "train/ce_loss": 0.5492417812347412 }, { "epoch": 0.14089381055962033, "step": 1425, "train/sim_loss": 0.078125 }, { "epoch": 0.14089381055962033, "step": 1425, "train/total_loss": 0.13304917514324188 }, { "entropy": 9.068655014038086, "epoch": 0.14099268340913584, "mean_token_accuracy": 0.7541152238845825, "num_tokens": 7843339.0, "step": 1426, "train/ce_loss": 1.0589027404785156 }, { "epoch": 0.14099268340913584, "step": 1426, "train/sim_loss": 0.078125 }, { "epoch": 0.14099268340913584, "step": 1426, "train/total_loss": 0.18401527404785156 }, { "entropy": 9.062170028686523, "epoch": 0.14109155625865138, "mean_token_accuracy": 0.6925675868988037, "num_tokens": 7848840.0, "step": 1427, "train/ce_loss": 1.0969041585922241 }, { "epoch": 0.14109155625865138, "step": 1427, "train/sim_loss": 0.12890625 }, { "epoch": 0.14109155625865138, "step": 1427, "train/total_loss": 0.23859667778015137 }, { "entropy": 9.412635803222656, "epoch": 0.1411904291081669, "mean_token_accuracy": 0.7309237122535706, "num_tokens": 7854149.0, "step": 1428, "train/ce_loss": 1.3343346118927002 }, { "epoch": 0.1411904291081669, "step": 1428, "train/sim_loss": 0.09375 }, { "epoch": 0.1411904291081669, "step": 1428, "train/total_loss": 0.22718346118927002 }, { "entropy": 9.54632568359375, "epoch": 0.1412893019576824, "mean_token_accuracy": 0.7394366264343262, "num_tokens": 7859370.0, "step": 1429, "train/ce_loss": 0.5620748400688171 }, { "epoch": 0.1412893019576824, "step": 1429, "train/sim_loss": 0.0859375 }, { "epoch": 0.1412893019576824, "step": 1429, "train/total_loss": 0.14214497804641724 }, { "entropy": 9.309700012207031, "epoch": 0.14138817480719795, "mean_token_accuracy": 0.7170068025588989, "num_tokens": 7864730.0, "step": 1430, "train/ce_loss": 0.7075472474098206 }, { "epoch": 0.14138817480719795, "step": 1430, "train/sim_loss": 0.0625 }, { "epoch": 0.14138817480719795, "step": 1430, "train/total_loss": 0.133254736661911 }, { "entropy": 8.986120223999023, "epoch": 0.14148704765671347, "mean_token_accuracy": 0.76243656873703, "num_tokens": 7870387.0, "step": 1431, "train/ce_loss": 0.9298255443572998 }, { "epoch": 0.14148704765671347, "step": 1431, "train/sim_loss": 0.12890625 }, { "epoch": 0.14148704765671347, "step": 1431, "train/total_loss": 0.22188881039619446 }, { "entropy": 9.60708236694336, "epoch": 0.14158592050622898, "mean_token_accuracy": 0.7684365510940552, "num_tokens": 7875629.0, "step": 1432, "train/ce_loss": 1.0031827688217163 }, { "epoch": 0.14158592050622898, "step": 1432, "train/sim_loss": 0.125 }, { "epoch": 0.14158592050622898, "step": 1432, "train/total_loss": 0.2253182828426361 }, { "entropy": 8.784000396728516, "epoch": 0.14168479335574452, "mean_token_accuracy": 0.7527472376823425, "num_tokens": 7881408.0, "step": 1433, "train/ce_loss": 0.8853614330291748 }, { "epoch": 0.14168479335574452, "step": 1433, "train/sim_loss": 0.109375 }, { "epoch": 0.14168479335574452, "step": 1433, "train/total_loss": 0.19791114330291748 }, { "entropy": 9.122184753417969, "epoch": 0.14178366620526003, "mean_token_accuracy": 0.7723880410194397, "num_tokens": 7886846.0, "step": 1434, "train/ce_loss": 0.5837305784225464 }, { "epoch": 0.14178366620526003, "step": 1434, "train/sim_loss": 0.046875 }, { "epoch": 0.14178366620526003, "step": 1434, "train/total_loss": 0.10524806380271912 }, { "entropy": 9.260372161865234, "epoch": 0.14188253905477555, "mean_token_accuracy": 0.7496671080589294, "num_tokens": 7892238.0, "step": 1435, "train/ce_loss": 0.7783677577972412 }, { "epoch": 0.14188253905477555, "step": 1435, "train/sim_loss": 0.08203125 }, { "epoch": 0.14188253905477555, "step": 1435, "train/total_loss": 0.1598680317401886 }, { "entropy": 9.13823127746582, "epoch": 0.1419814119042911, "mean_token_accuracy": 0.6882978677749634, "num_tokens": 7897806.0, "step": 1436, "train/ce_loss": 0.48492103815078735 }, { "epoch": 0.1419814119042911, "step": 1436, "train/sim_loss": 0.10546875 }, { "epoch": 0.1419814119042911, "step": 1436, "train/total_loss": 0.15396085381507874 }, { "entropy": 9.075601577758789, "epoch": 0.1420802847538066, "mean_token_accuracy": 0.73575758934021, "num_tokens": 7903292.0, "step": 1437, "train/ce_loss": 1.261603593826294 }, { "epoch": 0.1420802847538066, "step": 1437, "train/sim_loss": 0.08984375 }, { "epoch": 0.1420802847538066, "step": 1437, "train/total_loss": 0.2160041183233261 }, { "entropy": 9.426538467407227, "epoch": 0.14217915760332211, "mean_token_accuracy": 0.7484737634658813, "num_tokens": 7908606.0, "step": 1438, "train/ce_loss": 0.8277501463890076 }, { "epoch": 0.14217915760332211, "step": 1438, "train/sim_loss": 0.0859375 }, { "epoch": 0.14217915760332211, "step": 1438, "train/total_loss": 0.1687125265598297 }, { "entropy": 8.945484161376953, "epoch": 0.14227803045283766, "mean_token_accuracy": 0.7751479148864746, "num_tokens": 7914307.0, "step": 1439, "train/ce_loss": 0.7978754639625549 }, { "epoch": 0.14227803045283766, "step": 1439, "train/sim_loss": 0.1171875 }, { "epoch": 0.14227803045283766, "step": 1439, "train/total_loss": 0.19697505235671997 }, { "epoch": 0.14237690330235317, "grad_norm": 1.113887906074524, "learning_rate": 9.646689413044555e-06, "loss": 0.1662, "step": 1440 }, { "entropy": 9.316303253173828, "epoch": 0.14237690330235317, "mean_token_accuracy": 0.7304551005363464, "num_tokens": 7919736.0, "step": 1440, "train/ce_loss": 1.0885690450668335 }, { "epoch": 0.14237690330235317, "step": 1440, "train/sim_loss": 0.0703125 }, { "epoch": 0.14237690330235317, "step": 1440, "train/total_loss": 0.1791694164276123 }, { "entropy": 9.111612319946289, "epoch": 0.1424757761518687, "mean_token_accuracy": 0.7837541103363037, "num_tokens": 7925315.0, "step": 1441, "train/ce_loss": 0.7581990957260132 }, { "epoch": 0.1424757761518687, "step": 1441, "train/sim_loss": 0.1171875 }, { "epoch": 0.1424757761518687, "step": 1441, "train/total_loss": 0.19300740957260132 }, { "entropy": 9.500110626220703, "epoch": 0.14257464900138422, "mean_token_accuracy": 0.7234899401664734, "num_tokens": 7930678.0, "step": 1442, "train/ce_loss": 0.5060451030731201 }, { "epoch": 0.14257464900138422, "step": 1442, "train/sim_loss": 0.0703125 }, { "epoch": 0.14257464900138422, "step": 1442, "train/total_loss": 0.12091700732707977 }, { "entropy": 9.374744415283203, "epoch": 0.14267352185089974, "mean_token_accuracy": 0.7245145440101624, "num_tokens": 7936080.0, "step": 1443, "train/ce_loss": 0.9441497325897217 }, { "epoch": 0.14267352185089974, "step": 1443, "train/sim_loss": 0.046875 }, { "epoch": 0.14267352185089974, "step": 1443, "train/total_loss": 0.14128997921943665 }, { "entropy": 9.182491302490234, "epoch": 0.14277239470041528, "mean_token_accuracy": 0.6708595156669617, "num_tokens": 7941626.0, "step": 1444, "train/ce_loss": 1.012751817703247 }, { "epoch": 0.14277239470041528, "step": 1444, "train/sim_loss": 0.0703125 }, { "epoch": 0.14277239470041528, "step": 1444, "train/total_loss": 0.17158767580986023 }, { "entropy": 9.254650115966797, "epoch": 0.1428712675499308, "mean_token_accuracy": 0.7651715278625488, "num_tokens": 7947011.0, "step": 1445, "train/ce_loss": 0.7885429859161377 }, { "epoch": 0.1428712675499308, "step": 1445, "train/sim_loss": 0.05859375 }, { "epoch": 0.1428712675499308, "step": 1445, "train/total_loss": 0.1374480426311493 }, { "entropy": 9.136272430419922, "epoch": 0.1429701403994463, "mean_token_accuracy": 0.7485582232475281, "num_tokens": 7952488.0, "step": 1446, "train/ce_loss": 0.6755207777023315 }, { "epoch": 0.1429701403994463, "step": 1446, "train/sim_loss": 0.0859375 }, { "epoch": 0.1429701403994463, "step": 1446, "train/total_loss": 0.1534895896911621 }, { "entropy": 9.354090690612793, "epoch": 0.14306901324896185, "mean_token_accuracy": 0.7889150977134705, "num_tokens": 7957945.0, "step": 1447, "train/ce_loss": 0.5939737558364868 }, { "epoch": 0.14306901324896185, "step": 1447, "train/sim_loss": 0.07421875 }, { "epoch": 0.14306901324896185, "step": 1447, "train/total_loss": 0.1336161196231842 }, { "entropy": 8.840502738952637, "epoch": 0.14316788609847736, "mean_token_accuracy": 0.7245804667472839, "num_tokens": 7963540.0, "step": 1448, "train/ce_loss": 1.0706021785736084 }, { "epoch": 0.14316788609847736, "step": 1448, "train/sim_loss": 0.0703125 }, { "epoch": 0.14316788609847736, "step": 1448, "train/total_loss": 0.17737272381782532 }, { "entropy": 8.970861434936523, "epoch": 0.14326675894799287, "mean_token_accuracy": 0.7063953280448914, "num_tokens": 7969193.0, "step": 1449, "train/ce_loss": 1.2977765798568726 }, { "epoch": 0.14326675894799287, "step": 1449, "train/sim_loss": 0.11328125 }, { "epoch": 0.14326675894799287, "step": 1449, "train/total_loss": 0.24305890500545502 }, { "entropy": 9.26669692993164, "epoch": 0.1433656317975084, "mean_token_accuracy": 0.7766323089599609, "num_tokens": 7974673.0, "step": 1450, "train/ce_loss": 0.8148425221443176 }, { "epoch": 0.1433656317975084, "step": 1450, "train/sim_loss": 0.06640625 }, { "epoch": 0.1433656317975084, "step": 1450, "train/total_loss": 0.14789050817489624 }, { "entropy": 9.373781204223633, "epoch": 0.14346450464702393, "mean_token_accuracy": 0.7430657148361206, "num_tokens": 7980013.0, "step": 1451, "train/ce_loss": 1.0379115343093872 }, { "epoch": 0.14346450464702393, "step": 1451, "train/sim_loss": 0.078125 }, { "epoch": 0.14346450464702393, "step": 1451, "train/total_loss": 0.18191614747047424 }, { "entropy": 8.802631378173828, "epoch": 0.14356337749653944, "mean_token_accuracy": 0.692556619644165, "num_tokens": 7985875.0, "step": 1452, "train/ce_loss": 0.9387755393981934 }, { "epoch": 0.14356337749653944, "step": 1452, "train/sim_loss": 0.1171875 }, { "epoch": 0.14356337749653944, "step": 1452, "train/total_loss": 0.21106505393981934 }, { "entropy": 9.017794609069824, "epoch": 0.14366225034605498, "mean_token_accuracy": 0.7017017006874084, "num_tokens": 7991502.0, "step": 1453, "train/ce_loss": 0.5222609043121338 }, { "epoch": 0.14366225034605498, "step": 1453, "train/sim_loss": 0.03515625 }, { "epoch": 0.14366225034605498, "step": 1453, "train/total_loss": 0.08738234639167786 }, { "entropy": 9.562996864318848, "epoch": 0.1437611231955705, "mean_token_accuracy": 0.7363636493682861, "num_tokens": 7996877.0, "step": 1454, "train/ce_loss": 0.5800989270210266 }, { "epoch": 0.1437611231955705, "step": 1454, "train/sim_loss": 0.0703125 }, { "epoch": 0.1437611231955705, "step": 1454, "train/total_loss": 0.12832239270210266 }, { "entropy": 9.197663307189941, "epoch": 0.143859996045086, "mean_token_accuracy": 0.7205707430839539, "num_tokens": 8002336.0, "step": 1455, "train/ce_loss": 0.7865292429924011 }, { "epoch": 0.143859996045086, "step": 1455, "train/sim_loss": 0.09765625 }, { "epoch": 0.143859996045086, "step": 1455, "train/total_loss": 0.17630916833877563 }, { "entropy": 8.829607009887695, "epoch": 0.14395886889460155, "mean_token_accuracy": 0.7152900099754333, "num_tokens": 8008022.0, "step": 1456, "train/ce_loss": 0.5485926866531372 }, { "epoch": 0.14395886889460155, "step": 1456, "train/sim_loss": 0.08203125 }, { "epoch": 0.14395886889460155, "step": 1456, "train/total_loss": 0.13689051568508148 }, { "entropy": 9.262479782104492, "epoch": 0.14405774174411706, "mean_token_accuracy": 0.7612419724464417, "num_tokens": 8013496.0, "step": 1457, "train/ce_loss": 0.8270986676216125 }, { "epoch": 0.14405774174411706, "step": 1457, "train/sim_loss": 0.0390625 }, { "epoch": 0.14405774174411706, "step": 1457, "train/total_loss": 0.12177237123250961 }, { "entropy": 9.01937484741211, "epoch": 0.14415661459363258, "mean_token_accuracy": 0.7513227462768555, "num_tokens": 8018961.0, "step": 1458, "train/ce_loss": 0.9025052785873413 }, { "epoch": 0.14415661459363258, "step": 1458, "train/sim_loss": 0.06640625 }, { "epoch": 0.14415661459363258, "step": 1458, "train/total_loss": 0.15665677189826965 }, { "entropy": 9.160143852233887, "epoch": 0.14425548744314812, "mean_token_accuracy": 0.757080614566803, "num_tokens": 8024430.0, "step": 1459, "train/ce_loss": 0.8914438486099243 }, { "epoch": 0.14425548744314812, "step": 1459, "train/sim_loss": 0.0859375 }, { "epoch": 0.14425548744314812, "step": 1459, "train/total_loss": 0.17508187890052795 }, { "epoch": 0.14435436029266363, "grad_norm": 1.0184705257415771, "learning_rate": 9.641744548286605e-06, "loss": 0.159, "step": 1460 }, { "entropy": 9.157100677490234, "epoch": 0.14435436029266363, "mean_token_accuracy": 0.7165714502334595, "num_tokens": 8029834.0, "step": 1460, "train/ce_loss": 0.7077667117118835 }, { "epoch": 0.14435436029266363, "step": 1460, "train/sim_loss": 0.078125 }, { "epoch": 0.14435436029266363, "step": 1460, "train/total_loss": 0.14890167117118835 }, { "entropy": 9.343542098999023, "epoch": 0.14445323314217914, "mean_token_accuracy": 0.6902106404304504, "num_tokens": 8035281.0, "step": 1461, "train/ce_loss": 0.9006768465042114 }, { "epoch": 0.14445323314217914, "step": 1461, "train/sim_loss": 0.1171875 }, { "epoch": 0.14445323314217914, "step": 1461, "train/total_loss": 0.20725518465042114 }, { "entropy": 9.56401252746582, "epoch": 0.14455210599169468, "mean_token_accuracy": 0.7610965967178345, "num_tokens": 8040631.0, "step": 1462, "train/ce_loss": 0.5351071953773499 }, { "epoch": 0.14455210599169468, "step": 1462, "train/sim_loss": 0.0546875 }, { "epoch": 0.14455210599169468, "step": 1462, "train/total_loss": 0.10819822549819946 }, { "entropy": 9.15139102935791, "epoch": 0.1446509788412102, "mean_token_accuracy": 0.7698154449462891, "num_tokens": 8046238.0, "step": 1463, "train/ce_loss": 0.6718717813491821 }, { "epoch": 0.1446509788412102, "step": 1463, "train/sim_loss": 0.0390625 }, { "epoch": 0.1446509788412102, "step": 1463, "train/total_loss": 0.10624968260526657 }, { "entropy": 8.861726760864258, "epoch": 0.14474985169072574, "mean_token_accuracy": 0.7088502645492554, "num_tokens": 8051984.0, "step": 1464, "train/ce_loss": 0.5099020600318909 }, { "epoch": 0.14474985169072574, "step": 1464, "train/sim_loss": 0.10546875 }, { "epoch": 0.14474985169072574, "step": 1464, "train/total_loss": 0.15645895898342133 }, { "entropy": 9.207891464233398, "epoch": 0.14484872454024125, "mean_token_accuracy": 0.732300877571106, "num_tokens": 8057470.0, "step": 1465, "train/ce_loss": 1.425228476524353 }, { "epoch": 0.14484872454024125, "step": 1465, "train/sim_loss": 0.08984375 }, { "epoch": 0.14484872454024125, "step": 1465, "train/total_loss": 0.23236660659313202 }, { "entropy": 9.272872924804688, "epoch": 0.14494759738975677, "mean_token_accuracy": 0.7417640686035156, "num_tokens": 8062951.0, "step": 1466, "train/ce_loss": 0.6314181685447693 }, { "epoch": 0.14494759738975677, "step": 1466, "train/sim_loss": 0.05078125 }, { "epoch": 0.14494759738975677, "step": 1466, "train/total_loss": 0.11392306536436081 }, { "entropy": 8.986576080322266, "epoch": 0.1450464702392723, "mean_token_accuracy": 0.7474644780158997, "num_tokens": 8068497.0, "step": 1467, "train/ce_loss": 0.6571402549743652 }, { "epoch": 0.1450464702392723, "step": 1467, "train/sim_loss": 0.046875 }, { "epoch": 0.1450464702392723, "step": 1467, "train/total_loss": 0.1125890240073204 }, { "entropy": 9.300371170043945, "epoch": 0.14514534308878782, "mean_token_accuracy": 0.7078891396522522, "num_tokens": 8074092.0, "step": 1468, "train/ce_loss": 1.3419071435928345 }, { "epoch": 0.14514534308878782, "step": 1468, "train/sim_loss": 0.11328125 }, { "epoch": 0.14514534308878782, "step": 1468, "train/total_loss": 0.24747197329998016 }, { "entropy": 9.235715866088867, "epoch": 0.14524421593830333, "mean_token_accuracy": 0.7450980544090271, "num_tokens": 8079608.0, "step": 1469, "train/ce_loss": 0.49872124195098877 }, { "epoch": 0.14524421593830333, "step": 1469, "train/sim_loss": 0.03515625 }, { "epoch": 0.14524421593830333, "step": 1469, "train/total_loss": 0.08502838015556335 }, { "entropy": 9.356996536254883, "epoch": 0.14534308878781887, "mean_token_accuracy": 0.8107784390449524, "num_tokens": 8085109.0, "step": 1470, "train/ce_loss": 0.9987492561340332 }, { "epoch": 0.14534308878781887, "step": 1470, "train/sim_loss": 0.03515625 }, { "epoch": 0.14534308878781887, "step": 1470, "train/total_loss": 0.13503117859363556 }, { "entropy": 8.872389793395996, "epoch": 0.1454419616373344, "mean_token_accuracy": 0.6865509748458862, "num_tokens": 8090700.0, "step": 1471, "train/ce_loss": 1.0638185739517212 }, { "epoch": 0.1454419616373344, "step": 1471, "train/sim_loss": 0.109375 }, { "epoch": 0.1454419616373344, "step": 1471, "train/total_loss": 0.2157568633556366 }, { "entropy": 9.187093734741211, "epoch": 0.1455408344868499, "mean_token_accuracy": 0.7494407296180725, "num_tokens": 8096303.0, "step": 1472, "train/ce_loss": 1.167836308479309 }, { "epoch": 0.1455408344868499, "step": 1472, "train/sim_loss": 0.08203125 }, { "epoch": 0.1455408344868499, "step": 1472, "train/total_loss": 0.19881488382816315 }, { "entropy": 9.653594970703125, "epoch": 0.14563970733636544, "mean_token_accuracy": 0.7349926829338074, "num_tokens": 8101476.0, "step": 1473, "train/ce_loss": 0.609351634979248 }, { "epoch": 0.14563970733636544, "step": 1473, "train/sim_loss": 0.0546875 }, { "epoch": 0.14563970733636544, "step": 1473, "train/total_loss": 0.11562266945838928 }, { "entropy": 9.371780395507812, "epoch": 0.14573858018588096, "mean_token_accuracy": 0.7503136992454529, "num_tokens": 8106841.0, "step": 1474, "train/ce_loss": 1.0243980884552002 }, { "epoch": 0.14573858018588096, "step": 1474, "train/sim_loss": 0.078125 }, { "epoch": 0.14573858018588096, "step": 1474, "train/total_loss": 0.18056482076644897 }, { "entropy": 9.529854774475098, "epoch": 0.14583745303539647, "mean_token_accuracy": 0.7551020383834839, "num_tokens": 8112126.0, "step": 1475, "train/ce_loss": 0.4797247648239136 }, { "epoch": 0.14583745303539647, "step": 1475, "train/sim_loss": 0.109375 }, { "epoch": 0.14583745303539647, "step": 1475, "train/total_loss": 0.15734747052192688 }, { "entropy": 9.32413101196289, "epoch": 0.145936325884912, "mean_token_accuracy": 0.713936448097229, "num_tokens": 8117570.0, "step": 1476, "train/ce_loss": 1.3238039016723633 }, { "epoch": 0.145936325884912, "step": 1476, "train/sim_loss": 0.125 }, { "epoch": 0.145936325884912, "step": 1476, "train/total_loss": 0.2573803961277008 }, { "entropy": 9.353128433227539, "epoch": 0.14603519873442752, "mean_token_accuracy": 0.7411630749702454, "num_tokens": 8123069.0, "step": 1477, "train/ce_loss": 0.655774712562561 }, { "epoch": 0.14603519873442752, "step": 1477, "train/sim_loss": 0.0859375 }, { "epoch": 0.14603519873442752, "step": 1477, "train/total_loss": 0.15151497721672058 }, { "entropy": 9.56015396118164, "epoch": 0.14613407158394304, "mean_token_accuracy": 0.7115384340286255, "num_tokens": 8128368.0, "step": 1478, "train/ce_loss": 0.7193466424942017 }, { "epoch": 0.14613407158394304, "step": 1478, "train/sim_loss": 0.078125 }, { "epoch": 0.14613407158394304, "step": 1478, "train/total_loss": 0.15005967020988464 }, { "entropy": 9.05007553100586, "epoch": 0.14623294443345858, "mean_token_accuracy": 0.684518039226532, "num_tokens": 8134018.0, "step": 1479, "train/ce_loss": 1.3233824968338013 }, { "epoch": 0.14623294443345858, "step": 1479, "train/sim_loss": 0.06640625 }, { "epoch": 0.14623294443345858, "step": 1479, "train/total_loss": 0.1987445056438446 }, { "epoch": 0.1463318172829741, "grad_norm": 1.0874214172363281, "learning_rate": 9.636799683528656e-06, "loss": 0.1624, "step": 1480 }, { "entropy": 9.121602058410645, "epoch": 0.1463318172829741, "mean_token_accuracy": 0.7489539980888367, "num_tokens": 8139642.0, "step": 1480, "train/ce_loss": 1.173570990562439 }, { "epoch": 0.1463318172829741, "step": 1480, "train/sim_loss": 0.1953125 }, { "epoch": 0.1463318172829741, "step": 1480, "train/total_loss": 0.3126696050167084 }, { "entropy": 9.227030754089355, "epoch": 0.1464306901324896, "mean_token_accuracy": 0.6848049163818359, "num_tokens": 8145213.0, "step": 1481, "train/ce_loss": 0.8534135222434998 }, { "epoch": 0.1464306901324896, "step": 1481, "train/sim_loss": 0.0703125 }, { "epoch": 0.1464306901324896, "step": 1481, "train/total_loss": 0.15565386414527893 }, { "entropy": 9.065132141113281, "epoch": 0.14652956298200515, "mean_token_accuracy": 0.7308160662651062, "num_tokens": 8150685.0, "step": 1482, "train/ce_loss": 1.3169811964035034 }, { "epoch": 0.14652956298200515, "step": 1482, "train/sim_loss": 0.0703125 }, { "epoch": 0.14652956298200515, "step": 1482, "train/total_loss": 0.2020106166601181 }, { "entropy": 8.99348258972168, "epoch": 0.14662843583152066, "mean_token_accuracy": 0.6997219920158386, "num_tokens": 8156277.0, "step": 1483, "train/ce_loss": 0.35185790061950684 }, { "epoch": 0.14662843583152066, "step": 1483, "train/sim_loss": 0.0546875 }, { "epoch": 0.14662843583152066, "step": 1483, "train/total_loss": 0.0898732915520668 }, { "entropy": 8.60426139831543, "epoch": 0.1467273086810362, "mean_token_accuracy": 0.7058399319648743, "num_tokens": 8162248.0, "step": 1484, "train/ce_loss": 0.7734392881393433 }, { "epoch": 0.1467273086810362, "step": 1484, "train/sim_loss": 0.1484375 }, { "epoch": 0.1467273086810362, "step": 1484, "train/total_loss": 0.22578144073486328 }, { "entropy": 9.09343147277832, "epoch": 0.1468261815305517, "mean_token_accuracy": 0.8329875469207764, "num_tokens": 8167765.0, "step": 1485, "train/ce_loss": 0.3718021512031555 }, { "epoch": 0.1468261815305517, "step": 1485, "train/sim_loss": 0.05859375 }, { "epoch": 0.1468261815305517, "step": 1485, "train/total_loss": 0.09577396512031555 }, { "entropy": 9.293237686157227, "epoch": 0.14692505438006723, "mean_token_accuracy": 0.7153652310371399, "num_tokens": 8173082.0, "step": 1486, "train/ce_loss": 0.9441676139831543 }, { "epoch": 0.14692505438006723, "step": 1486, "train/sim_loss": 0.0859375 }, { "epoch": 0.14692505438006723, "step": 1486, "train/total_loss": 0.1803542673587799 }, { "entropy": 9.315635681152344, "epoch": 0.14702392722958277, "mean_token_accuracy": 0.7303240895271301, "num_tokens": 8178532.0, "step": 1487, "train/ce_loss": 0.7251583933830261 }, { "epoch": 0.14702392722958277, "step": 1487, "train/sim_loss": 0.07421875 }, { "epoch": 0.14702392722958277, "step": 1487, "train/total_loss": 0.1467345952987671 }, { "entropy": 9.250520706176758, "epoch": 0.14712280007909828, "mean_token_accuracy": 0.7418513894081116, "num_tokens": 8183909.0, "step": 1488, "train/ce_loss": 0.7250444889068604 }, { "epoch": 0.14712280007909828, "step": 1488, "train/sim_loss": 0.05078125 }, { "epoch": 0.14712280007909828, "step": 1488, "train/total_loss": 0.1232857033610344 }, { "entropy": 9.202627182006836, "epoch": 0.1472216729286138, "mean_token_accuracy": 0.7270501852035522, "num_tokens": 8189378.0, "step": 1489, "train/ce_loss": 0.7287471890449524 }, { "epoch": 0.1472216729286138, "step": 1489, "train/sim_loss": 0.0390625 }, { "epoch": 0.1472216729286138, "step": 1489, "train/total_loss": 0.11193721741437912 }, { "entropy": 8.889427185058594, "epoch": 0.14732054577812934, "mean_token_accuracy": 0.7097902297973633, "num_tokens": 8195117.0, "step": 1490, "train/ce_loss": 0.8258557319641113 }, { "epoch": 0.14732054577812934, "step": 1490, "train/sim_loss": 0.1015625 }, { "epoch": 0.14732054577812934, "step": 1490, "train/total_loss": 0.18414807319641113 }, { "entropy": 9.163433074951172, "epoch": 0.14741941862764485, "mean_token_accuracy": 0.7472793459892273, "num_tokens": 8200520.0, "step": 1491, "train/ce_loss": 0.7120727300643921 }, { "epoch": 0.14741941862764485, "step": 1491, "train/sim_loss": 0.0859375 }, { "epoch": 0.14741941862764485, "step": 1491, "train/total_loss": 0.15714478492736816 }, { "entropy": 9.161109924316406, "epoch": 0.14751829147716036, "mean_token_accuracy": 0.7488636374473572, "num_tokens": 8206073.0, "step": 1492, "train/ce_loss": 0.7670421004295349 }, { "epoch": 0.14751829147716036, "step": 1492, "train/sim_loss": 0.078125 }, { "epoch": 0.14751829147716036, "step": 1492, "train/total_loss": 0.154829204082489 }, { "entropy": 9.053354263305664, "epoch": 0.1476171643266759, "mean_token_accuracy": 0.7361563444137573, "num_tokens": 8211624.0, "step": 1493, "train/ce_loss": 0.42847588658332825 }, { "epoch": 0.1476171643266759, "step": 1493, "train/sim_loss": 0.0703125 }, { "epoch": 0.1476171643266759, "step": 1493, "train/total_loss": 0.11316008865833282 }, { "entropy": 9.14576244354248, "epoch": 0.14771603717619142, "mean_token_accuracy": 0.7467948794364929, "num_tokens": 8217194.0, "step": 1494, "train/ce_loss": 1.2590277194976807 }, { "epoch": 0.14771603717619142, "step": 1494, "train/sim_loss": 0.10546875 }, { "epoch": 0.14771603717619142, "step": 1494, "train/total_loss": 0.23137152194976807 }, { "entropy": 9.177995681762695, "epoch": 0.14781491002570693, "mean_token_accuracy": 0.7610333561897278, "num_tokens": 8222761.0, "step": 1495, "train/ce_loss": 0.5266033411026001 }, { "epoch": 0.14781491002570693, "step": 1495, "train/sim_loss": 0.0859375 }, { "epoch": 0.14781491002570693, "step": 1495, "train/total_loss": 0.13859783113002777 }, { "entropy": 9.68423080444336, "epoch": 0.14791378287522247, "mean_token_accuracy": 0.7421203255653381, "num_tokens": 8227980.0, "step": 1496, "train/ce_loss": 1.1455931663513184 }, { "epoch": 0.14791378287522247, "step": 1496, "train/sim_loss": 0.046875 }, { "epoch": 0.14791378287522247, "step": 1496, "train/total_loss": 0.1614343225955963 }, { "entropy": 8.764913558959961, "epoch": 0.14801265572473798, "mean_token_accuracy": 0.7536585330963135, "num_tokens": 8233791.0, "step": 1497, "train/ce_loss": 0.3673393428325653 }, { "epoch": 0.14801265572473798, "step": 1497, "train/sim_loss": 0.04296875 }, { "epoch": 0.14801265572473798, "step": 1497, "train/total_loss": 0.07970269024372101 }, { "entropy": 8.944366455078125, "epoch": 0.1481115285742535, "mean_token_accuracy": 0.6684587597846985, "num_tokens": 8239547.0, "step": 1498, "train/ce_loss": 0.7205097079277039 }, { "epoch": 0.1481115285742535, "step": 1498, "train/sim_loss": 0.16015625 }, { "epoch": 0.1481115285742535, "step": 1498, "train/total_loss": 0.23220722377300262 }, { "entropy": 9.064006805419922, "epoch": 0.14821040142376904, "mean_token_accuracy": 0.7294589281082153, "num_tokens": 8245159.0, "step": 1499, "train/ce_loss": 0.9856253862380981 }, { "epoch": 0.14821040142376904, "step": 1499, "train/sim_loss": 0.125 }, { "epoch": 0.14821040142376904, "step": 1499, "train/total_loss": 0.22356253862380981 }, { "epoch": 0.14830927427328455, "grad_norm": 0.9872274994850159, "learning_rate": 9.631854818770708e-06, "loss": 0.1642, "step": 1500 }, { "entropy": 8.993939399719238, "epoch": 0.14830927427328455, "mean_token_accuracy": 0.7149901390075684, "num_tokens": 8250746.0, "step": 1500, "train/ce_loss": 0.8511725068092346 }, { "epoch": 0.14830927427328455, "step": 1500, "train/sim_loss": 0.05078125 }, { "epoch": 0.14830927427328455, "step": 1500, "train/total_loss": 0.13589850068092346 }, { "entropy": 9.123931884765625, "epoch": 0.14840814712280007, "mean_token_accuracy": 0.7241746783256531, "num_tokens": 8256376.0, "step": 1501, "train/ce_loss": 0.6595689058303833 }, { "epoch": 0.14840814712280007, "step": 1501, "train/sim_loss": 0.0859375 }, { "epoch": 0.14840814712280007, "step": 1501, "train/total_loss": 0.15189439058303833 }, { "entropy": 9.219168663024902, "epoch": 0.1485070199723156, "mean_token_accuracy": 0.6964490413665771, "num_tokens": 8261854.0, "step": 1502, "train/ce_loss": 1.5336631536483765 }, { "epoch": 0.1485070199723156, "step": 1502, "train/sim_loss": 0.13671875 }, { "epoch": 0.1485070199723156, "step": 1502, "train/total_loss": 0.2900850772857666 }, { "entropy": 9.356778144836426, "epoch": 0.14860589282183112, "mean_token_accuracy": 0.7557997703552246, "num_tokens": 8267296.0, "step": 1503, "train/ce_loss": 0.6862707138061523 }, { "epoch": 0.14860589282183112, "step": 1503, "train/sim_loss": 0.0234375 }, { "epoch": 0.14860589282183112, "step": 1503, "train/total_loss": 0.09206457436084747 }, { "entropy": 9.183387756347656, "epoch": 0.14870476567134666, "mean_token_accuracy": 0.7379310131072998, "num_tokens": 8272783.0, "step": 1504, "train/ce_loss": 0.7619252800941467 }, { "epoch": 0.14870476567134666, "step": 1504, "train/sim_loss": 0.03125 }, { "epoch": 0.14870476567134666, "step": 1504, "train/total_loss": 0.10744252800941467 }, { "entropy": 9.372587203979492, "epoch": 0.14880363852086217, "mean_token_accuracy": 0.7906976938247681, "num_tokens": 8278231.0, "step": 1505, "train/ce_loss": 1.1134233474731445 }, { "epoch": 0.14880363852086217, "step": 1505, "train/sim_loss": 0.09765625 }, { "epoch": 0.14880363852086217, "step": 1505, "train/total_loss": 0.20899859070777893 }, { "entropy": 9.47115707397461, "epoch": 0.1489025113703777, "mean_token_accuracy": 0.7634690999984741, "num_tokens": 8283574.0, "step": 1506, "train/ce_loss": 1.1157681941986084 }, { "epoch": 0.1489025113703777, "step": 1506, "train/sim_loss": 0.1015625 }, { "epoch": 0.1489025113703777, "step": 1506, "train/total_loss": 0.21313932538032532 }, { "entropy": 9.48731803894043, "epoch": 0.14900138421989323, "mean_token_accuracy": 0.7503410577774048, "num_tokens": 8288910.0, "step": 1507, "train/ce_loss": 0.9126212000846863 }, { "epoch": 0.14900138421989323, "step": 1507, "train/sim_loss": 0.08984375 }, { "epoch": 0.14900138421989323, "step": 1507, "train/total_loss": 0.18110588192939758 }, { "entropy": 9.076700210571289, "epoch": 0.14910025706940874, "mean_token_accuracy": 0.724952757358551, "num_tokens": 8294533.0, "step": 1508, "train/ce_loss": 1.4346047639846802 }, { "epoch": 0.14910025706940874, "step": 1508, "train/sim_loss": 0.1484375 }, { "epoch": 0.14910025706940874, "step": 1508, "train/total_loss": 0.2918979823589325 }, { "entropy": 9.402483940124512, "epoch": 0.14919912991892426, "mean_token_accuracy": 0.743628203868866, "num_tokens": 8299843.0, "step": 1509, "train/ce_loss": 1.1148216724395752 }, { "epoch": 0.14919912991892426, "step": 1509, "train/sim_loss": 0.08984375 }, { "epoch": 0.14919912991892426, "step": 1509, "train/total_loss": 0.201325923204422 }, { "entropy": 9.627172470092773, "epoch": 0.1492980027684398, "mean_token_accuracy": 0.7444751262664795, "num_tokens": 8305211.0, "step": 1510, "train/ce_loss": 0.9091204404830933 }, { "epoch": 0.1492980027684398, "step": 1510, "train/sim_loss": 0.07421875 }, { "epoch": 0.1492980027684398, "step": 1510, "train/total_loss": 0.16513079404830933 }, { "entropy": 9.067307472229004, "epoch": 0.1493968756179553, "mean_token_accuracy": 0.7574660778045654, "num_tokens": 8310815.0, "step": 1511, "train/ce_loss": 1.4474284648895264 }, { "epoch": 0.1493968756179553, "step": 1511, "train/sim_loss": 0.05859375 }, { "epoch": 0.1493968756179553, "step": 1511, "train/total_loss": 0.20333659648895264 }, { "entropy": 9.270044326782227, "epoch": 0.14949574846747082, "mean_token_accuracy": 0.7547169923782349, "num_tokens": 8316210.0, "step": 1512, "train/ce_loss": 0.8186160326004028 }, { "epoch": 0.14949574846747082, "step": 1512, "train/sim_loss": 0.10546875 }, { "epoch": 0.14949574846747082, "step": 1512, "train/total_loss": 0.18733036518096924 }, { "entropy": 9.149812698364258, "epoch": 0.14959462131698636, "mean_token_accuracy": 0.7633587718009949, "num_tokens": 8321748.0, "step": 1513, "train/ce_loss": 0.6878855228424072 }, { "epoch": 0.14959462131698636, "step": 1513, "train/sim_loss": 0.0859375 }, { "epoch": 0.14959462131698636, "step": 1513, "train/total_loss": 0.1547260582447052 }, { "entropy": 9.342554092407227, "epoch": 0.14969349416650188, "mean_token_accuracy": 0.7185929417610168, "num_tokens": 8327128.0, "step": 1514, "train/ce_loss": 0.8039326667785645 }, { "epoch": 0.14969349416650188, "step": 1514, "train/sim_loss": 0.12109375 }, { "epoch": 0.14969349416650188, "step": 1514, "train/total_loss": 0.20148701965808868 }, { "entropy": 9.265625, "epoch": 0.1497923670160174, "mean_token_accuracy": 0.7109557390213013, "num_tokens": 8332627.0, "step": 1515, "train/ce_loss": 0.76774662733078 }, { "epoch": 0.1497923670160174, "step": 1515, "train/sim_loss": 0.0546875 }, { "epoch": 0.1497923670160174, "step": 1515, "train/total_loss": 0.13146215677261353 }, { "entropy": 9.270820617675781, "epoch": 0.14989123986553293, "mean_token_accuracy": 0.7301387190818787, "num_tokens": 8338050.0, "step": 1516, "train/ce_loss": 0.6814010739326477 }, { "epoch": 0.14989123986553293, "step": 1516, "train/sim_loss": 0.0703125 }, { "epoch": 0.14989123986553293, "step": 1516, "train/total_loss": 0.13845261931419373 }, { "entropy": 9.410277366638184, "epoch": 0.14999011271504845, "mean_token_accuracy": 0.772251307964325, "num_tokens": 8343425.0, "step": 1517, "train/ce_loss": 0.6596589684486389 }, { "epoch": 0.14999011271504845, "step": 1517, "train/sim_loss": 0.07421875 }, { "epoch": 0.14999011271504845, "step": 1517, "train/total_loss": 0.14018464088439941 }, { "entropy": 9.872255325317383, "epoch": 0.15008898556456396, "mean_token_accuracy": 0.7629757523536682, "num_tokens": 8348618.0, "step": 1518, "train/ce_loss": 0.6789988875389099 }, { "epoch": 0.15008898556456396, "step": 1518, "train/sim_loss": 0.13671875 }, { "epoch": 0.15008898556456396, "step": 1518, "train/total_loss": 0.2046186327934265 }, { "entropy": 9.319687843322754, "epoch": 0.1501878584140795, "mean_token_accuracy": 0.7404305934906006, "num_tokens": 8354093.0, "step": 1519, "train/ce_loss": 0.9258723855018616 }, { "epoch": 0.1501878584140795, "step": 1519, "train/sim_loss": 0.08203125 }, { "epoch": 0.1501878584140795, "step": 1519, "train/total_loss": 0.17461848258972168 }, { "epoch": 0.150286731263595, "grad_norm": 1.0361205339431763, "learning_rate": 9.626909954012758e-06, "loss": 0.1616, "step": 1520 }, { "entropy": 9.239152908325195, "epoch": 0.150286731263595, "mean_token_accuracy": 0.7470511198043823, "num_tokens": 8359479.0, "step": 1520, "train/ce_loss": 0.846073567867279 }, { "epoch": 0.150286731263595, "step": 1520, "train/sim_loss": 0.15234375 }, { "epoch": 0.150286731263595, "step": 1520, "train/total_loss": 0.23695111274719238 }, { "entropy": 9.016777038574219, "epoch": 0.15038560411311053, "mean_token_accuracy": 0.7166494131088257, "num_tokens": 8365100.0, "step": 1521, "train/ce_loss": 0.7957527041435242 }, { "epoch": 0.15038560411311053, "step": 1521, "train/sim_loss": 0.0703125 }, { "epoch": 0.15038560411311053, "step": 1521, "train/total_loss": 0.14988777041435242 }, { "entropy": 9.526752471923828, "epoch": 0.15048447696262607, "mean_token_accuracy": 0.7245590090751648, "num_tokens": 8370414.0, "step": 1522, "train/ce_loss": 0.890181303024292 }, { "epoch": 0.15048447696262607, "step": 1522, "train/sim_loss": 0.12890625 }, { "epoch": 0.15048447696262607, "step": 1522, "train/total_loss": 0.21792438626289368 }, { "entropy": 9.180547714233398, "epoch": 0.15058334981214158, "mean_token_accuracy": 0.6863181591033936, "num_tokens": 8375912.0, "step": 1523, "train/ce_loss": 1.0768736600875854 }, { "epoch": 0.15058334981214158, "step": 1523, "train/sim_loss": 0.125 }, { "epoch": 0.15058334981214158, "step": 1523, "train/total_loss": 0.23268736898899078 }, { "entropy": 9.46397590637207, "epoch": 0.15068222266165712, "mean_token_accuracy": 0.7324841022491455, "num_tokens": 8381342.0, "step": 1524, "train/ce_loss": 0.9020326137542725 }, { "epoch": 0.15068222266165712, "step": 1524, "train/sim_loss": 0.078125 }, { "epoch": 0.15068222266165712, "step": 1524, "train/total_loss": 0.16832825541496277 }, { "entropy": 9.403278350830078, "epoch": 0.15078109551117264, "mean_token_accuracy": 0.7262357473373413, "num_tokens": 8386696.0, "step": 1525, "train/ce_loss": 0.7973259687423706 }, { "epoch": 0.15078109551117264, "step": 1525, "train/sim_loss": 0.078125 }, { "epoch": 0.15078109551117264, "step": 1525, "train/total_loss": 0.15785759687423706 }, { "entropy": 9.216888427734375, "epoch": 0.15087996836068815, "mean_token_accuracy": 0.8122270703315735, "num_tokens": 8392140.0, "step": 1526, "train/ce_loss": 0.4771326780319214 }, { "epoch": 0.15087996836068815, "step": 1526, "train/sim_loss": 0.05859375 }, { "epoch": 0.15087996836068815, "step": 1526, "train/total_loss": 0.1063070148229599 }, { "entropy": 9.577276229858398, "epoch": 0.1509788412102037, "mean_token_accuracy": 0.7654476761817932, "num_tokens": 8397430.0, "step": 1527, "train/ce_loss": 0.5853715538978577 }, { "epoch": 0.1509788412102037, "step": 1527, "train/sim_loss": 0.0390625 }, { "epoch": 0.1509788412102037, "step": 1527, "train/total_loss": 0.09759965538978577 }, { "entropy": 9.344989776611328, "epoch": 0.1510777140597192, "mean_token_accuracy": 0.6682188510894775, "num_tokens": 8402942.0, "step": 1528, "train/ce_loss": 0.7507085204124451 }, { "epoch": 0.1510777140597192, "step": 1528, "train/sim_loss": 0.0703125 }, { "epoch": 0.1510777140597192, "step": 1528, "train/total_loss": 0.14538335800170898 }, { "entropy": 9.36527156829834, "epoch": 0.15117658690923472, "mean_token_accuracy": 0.737726092338562, "num_tokens": 8408338.0, "step": 1529, "train/ce_loss": 0.5468299388885498 }, { "epoch": 0.15117658690923472, "step": 1529, "train/sim_loss": 0.06640625 }, { "epoch": 0.15117658690923472, "step": 1529, "train/total_loss": 0.12108924984931946 }, { "entropy": 9.215781211853027, "epoch": 0.15127545975875026, "mean_token_accuracy": 0.7166085839271545, "num_tokens": 8413823.0, "step": 1530, "train/ce_loss": 0.9639360904693604 }, { "epoch": 0.15127545975875026, "step": 1530, "train/sim_loss": 0.09765625 }, { "epoch": 0.15127545975875026, "step": 1530, "train/total_loss": 0.1940498650074005 }, { "entropy": 9.548506736755371, "epoch": 0.15137433260826577, "mean_token_accuracy": 0.7273972630500793, "num_tokens": 8419090.0, "step": 1531, "train/ce_loss": 0.9104582667350769 }, { "epoch": 0.15137433260826577, "step": 1531, "train/sim_loss": 0.078125 }, { "epoch": 0.15137433260826577, "step": 1531, "train/total_loss": 0.1691708266735077 }, { "entropy": 8.978182792663574, "epoch": 0.15147320545778128, "mean_token_accuracy": 0.7158878445625305, "num_tokens": 8424791.0, "step": 1532, "train/ce_loss": 0.7822136878967285 }, { "epoch": 0.15147320545778128, "step": 1532, "train/sim_loss": 0.08984375 }, { "epoch": 0.15147320545778128, "step": 1532, "train/total_loss": 0.1680651307106018 }, { "entropy": 9.472941398620605, "epoch": 0.15157207830729683, "mean_token_accuracy": 0.7664596438407898, "num_tokens": 8430366.0, "step": 1533, "train/ce_loss": 0.5607481598854065 }, { "epoch": 0.15157207830729683, "step": 1533, "train/sim_loss": 0.03125 }, { "epoch": 0.15157207830729683, "step": 1533, "train/total_loss": 0.08732481300830841 }, { "entropy": 9.349041938781738, "epoch": 0.15167095115681234, "mean_token_accuracy": 0.7659863829612732, "num_tokens": 8435647.0, "step": 1534, "train/ce_loss": 0.7744155526161194 }, { "epoch": 0.15167095115681234, "step": 1534, "train/sim_loss": 0.05859375 }, { "epoch": 0.15167095115681234, "step": 1534, "train/total_loss": 0.13603530824184418 }, { "entropy": 9.115571975708008, "epoch": 0.15176982400632785, "mean_token_accuracy": 0.7478540539741516, "num_tokens": 8441208.0, "step": 1535, "train/ce_loss": 0.8660216331481934 }, { "epoch": 0.15176982400632785, "step": 1535, "train/sim_loss": 0.078125 }, { "epoch": 0.15176982400632785, "step": 1535, "train/total_loss": 0.16472716629505157 }, { "entropy": 9.155649185180664, "epoch": 0.1518686968558434, "mean_token_accuracy": 0.670258641242981, "num_tokens": 8446734.0, "step": 1536, "train/ce_loss": 2.0274810791015625 }, { "epoch": 0.1518686968558434, "step": 1536, "train/sim_loss": 0.0703125 }, { "epoch": 0.1518686968558434, "step": 1536, "train/total_loss": 0.2730606198310852 }, { "entropy": 9.214540481567383, "epoch": 0.1519675697053589, "mean_token_accuracy": 0.7708860635757446, "num_tokens": 8452146.0, "step": 1537, "train/ce_loss": 0.5340110659599304 }, { "epoch": 0.1519675697053589, "step": 1537, "train/sim_loss": 0.0625 }, { "epoch": 0.1519675697053589, "step": 1537, "train/total_loss": 0.11590111255645752 }, { "entropy": 9.210163116455078, "epoch": 0.15206644255487442, "mean_token_accuracy": 0.7429854273796082, "num_tokens": 8457532.0, "step": 1538, "train/ce_loss": 0.42482709884643555 }, { "epoch": 0.15206644255487442, "step": 1538, "train/sim_loss": 0.05078125 }, { "epoch": 0.15206644255487442, "step": 1538, "train/total_loss": 0.09326396137475967 }, { "entropy": 8.905191421508789, "epoch": 0.15216531540438996, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 8463209.0, "step": 1539, "train/ce_loss": 0.5104749202728271 }, { "epoch": 0.15216531540438996, "step": 1539, "train/sim_loss": 0.046875 }, { "epoch": 0.15216531540438996, "step": 1539, "train/total_loss": 0.09792248904705048 }, { "epoch": 0.15226418825390547, "grad_norm": 1.0044260025024414, "learning_rate": 9.62196508925481e-06, "loss": 0.1666, "step": 1540 }, { "entropy": 9.546414375305176, "epoch": 0.15226418825390547, "mean_token_accuracy": 0.7562326788902283, "num_tokens": 8468573.0, "step": 1540, "train/ce_loss": 0.6715470552444458 }, { "epoch": 0.15226418825390547, "step": 1540, "train/sim_loss": 0.0859375 }, { "epoch": 0.15226418825390547, "step": 1540, "train/total_loss": 0.15309220552444458 }, { "entropy": 9.302713394165039, "epoch": 0.152363061103421, "mean_token_accuracy": 0.7932242751121521, "num_tokens": 8473987.0, "step": 1541, "train/ce_loss": 0.320748507976532 }, { "epoch": 0.152363061103421, "step": 1541, "train/sim_loss": 0.0859375 }, { "epoch": 0.152363061103421, "step": 1541, "train/total_loss": 0.11801235377788544 }, { "entropy": 9.179939270019531, "epoch": 0.15246193395293653, "mean_token_accuracy": 0.7160883545875549, "num_tokens": 8479566.0, "step": 1542, "train/ce_loss": 0.5203182101249695 }, { "epoch": 0.15246193395293653, "step": 1542, "train/sim_loss": 0.09765625 }, { "epoch": 0.15246193395293653, "step": 1542, "train/total_loss": 0.14968806505203247 }, { "entropy": 9.079798698425293, "epoch": 0.15256080680245204, "mean_token_accuracy": 0.7414141297340393, "num_tokens": 8485240.0, "step": 1543, "train/ce_loss": 0.6861346364021301 }, { "epoch": 0.15256080680245204, "step": 1543, "train/sim_loss": 0.140625 }, { "epoch": 0.15256080680245204, "step": 1543, "train/total_loss": 0.2092384696006775 }, { "entropy": 9.373895645141602, "epoch": 0.15265967965196756, "mean_token_accuracy": 0.7767969965934753, "num_tokens": 8490684.0, "step": 1544, "train/ce_loss": 0.6305524110794067 }, { "epoch": 0.15265967965196756, "step": 1544, "train/sim_loss": 0.03125 }, { "epoch": 0.15265967965196756, "step": 1544, "train/total_loss": 0.09430523961782455 }, { "entropy": 9.241270065307617, "epoch": 0.1527585525014831, "mean_token_accuracy": 0.7103762626647949, "num_tokens": 8496184.0, "step": 1545, "train/ce_loss": 1.0496549606323242 }, { "epoch": 0.1527585525014831, "step": 1545, "train/sim_loss": 0.09375 }, { "epoch": 0.1527585525014831, "step": 1545, "train/total_loss": 0.19871550798416138 }, { "entropy": 9.091014862060547, "epoch": 0.1528574253509986, "mean_token_accuracy": 0.7107142806053162, "num_tokens": 8501673.0, "step": 1546, "train/ce_loss": 0.8932127356529236 }, { "epoch": 0.1528574253509986, "step": 1546, "train/sim_loss": 0.05859375 }, { "epoch": 0.1528574253509986, "step": 1546, "train/total_loss": 0.1479150354862213 }, { "entropy": 9.389715194702148, "epoch": 0.15295629820051415, "mean_token_accuracy": 0.73575758934021, "num_tokens": 8507052.0, "step": 1547, "train/ce_loss": 0.35262659192085266 }, { "epoch": 0.15295629820051415, "step": 1547, "train/sim_loss": 0.08203125 }, { "epoch": 0.15295629820051415, "step": 1547, "train/total_loss": 0.11729390919208527 }, { "entropy": 9.15912914276123, "epoch": 0.15305517105002966, "mean_token_accuracy": 0.766780436038971, "num_tokens": 8512572.0, "step": 1548, "train/ce_loss": 0.329415500164032 }, { "epoch": 0.15305517105002966, "step": 1548, "train/sim_loss": 0.05078125 }, { "epoch": 0.15305517105002966, "step": 1548, "train/total_loss": 0.0837228000164032 }, { "entropy": 9.184064865112305, "epoch": 0.15315404389954518, "mean_token_accuracy": 0.7096465826034546, "num_tokens": 8518216.0, "step": 1549, "train/ce_loss": 0.5383620858192444 }, { "epoch": 0.15315404389954518, "step": 1549, "train/sim_loss": 0.09375 }, { "epoch": 0.15315404389954518, "step": 1549, "train/total_loss": 0.14758621156215668 }, { "entropy": 9.580951690673828, "epoch": 0.15325291674906072, "mean_token_accuracy": 0.7110519409179688, "num_tokens": 8523572.0, "step": 1550, "train/ce_loss": 1.274329423904419 }, { "epoch": 0.15325291674906072, "step": 1550, "train/sim_loss": 0.05078125 }, { "epoch": 0.15325291674906072, "step": 1550, "train/total_loss": 0.1782141923904419 }, { "entropy": 9.516918182373047, "epoch": 0.15335178959857623, "mean_token_accuracy": 0.7414880394935608, "num_tokens": 8528921.0, "step": 1551, "train/ce_loss": 0.6680640578269958 }, { "epoch": 0.15335178959857623, "step": 1551, "train/sim_loss": 0.0546875 }, { "epoch": 0.15335178959857623, "step": 1551, "train/total_loss": 0.12149390578269958 }, { "entropy": 9.316458702087402, "epoch": 0.15345066244809175, "mean_token_accuracy": 0.7308743000030518, "num_tokens": 8534382.0, "step": 1552, "train/ce_loss": 1.1553354263305664 }, { "epoch": 0.15345066244809175, "step": 1552, "train/sim_loss": 0.11328125 }, { "epoch": 0.15345066244809175, "step": 1552, "train/total_loss": 0.22881479561328888 }, { "entropy": 9.219989776611328, "epoch": 0.1535495352976073, "mean_token_accuracy": 0.6947497129440308, "num_tokens": 8539847.0, "step": 1553, "train/ce_loss": 0.9810245037078857 }, { "epoch": 0.1535495352976073, "step": 1553, "train/sim_loss": 0.05078125 }, { "epoch": 0.1535495352976073, "step": 1553, "train/total_loss": 0.14888370037078857 }, { "entropy": 9.420021057128906, "epoch": 0.1536484081471228, "mean_token_accuracy": 0.7433628439903259, "num_tokens": 8545253.0, "step": 1554, "train/ce_loss": 0.6763415932655334 }, { "epoch": 0.1536484081471228, "step": 1554, "train/sim_loss": 0.078125 }, { "epoch": 0.1536484081471228, "step": 1554, "train/total_loss": 0.14575916528701782 }, { "entropy": 9.280158042907715, "epoch": 0.1537472809966383, "mean_token_accuracy": 0.7372262477874756, "num_tokens": 8550718.0, "step": 1555, "train/ce_loss": 0.9829088449478149 }, { "epoch": 0.1537472809966383, "step": 1555, "train/sim_loss": 0.13671875 }, { "epoch": 0.1537472809966383, "step": 1555, "train/total_loss": 0.23500964045524597 }, { "entropy": 9.251842498779297, "epoch": 0.15384615384615385, "mean_token_accuracy": 0.6630316376686096, "num_tokens": 8556273.0, "step": 1556, "train/ce_loss": 2.1254656314849854 }, { "epoch": 0.15384615384615385, "step": 1556, "train/sim_loss": 0.12890625 }, { "epoch": 0.15384615384615385, "step": 1556, "train/total_loss": 0.34145283699035645 }, { "entropy": 9.31187915802002, "epoch": 0.15394502669566937, "mean_token_accuracy": 0.7156984210014343, "num_tokens": 8561689.0, "step": 1557, "train/ce_loss": 0.8441640138626099 }, { "epoch": 0.15394502669566937, "step": 1557, "train/sim_loss": 0.07421875 }, { "epoch": 0.15394502669566937, "step": 1557, "train/total_loss": 0.15863515436649323 }, { "entropy": 9.546073913574219, "epoch": 0.15404389954518488, "mean_token_accuracy": 0.7245033383369446, "num_tokens": 8567183.0, "step": 1558, "train/ce_loss": 0.6809753179550171 }, { "epoch": 0.15404389954518488, "step": 1558, "train/sim_loss": 0.046875 }, { "epoch": 0.15404389954518488, "step": 1558, "train/total_loss": 0.11497253179550171 }, { "entropy": 8.934013366699219, "epoch": 0.15414277239470042, "mean_token_accuracy": 0.709718644618988, "num_tokens": 8572628.0, "step": 1559, "train/ce_loss": 0.9076169729232788 }, { "epoch": 0.15414277239470042, "step": 1559, "train/sim_loss": 0.08203125 }, { "epoch": 0.15414277239470042, "step": 1559, "train/total_loss": 0.1727929413318634 }, { "epoch": 0.15424164524421594, "grad_norm": 1.1541110277175903, "learning_rate": 9.617020224496861e-06, "loss": 0.1675, "step": 1560 }, { "entropy": 9.446640968322754, "epoch": 0.15424164524421594, "mean_token_accuracy": 0.6934487223625183, "num_tokens": 8578011.0, "step": 1560, "train/ce_loss": 1.292360544204712 }, { "epoch": 0.15424164524421594, "step": 1560, "train/sim_loss": 0.0625 }, { "epoch": 0.15424164524421594, "step": 1560, "train/total_loss": 0.19173605740070343 }, { "entropy": 9.251684188842773, "epoch": 0.15434051809373145, "mean_token_accuracy": 0.7248954176902771, "num_tokens": 8583544.0, "step": 1561, "train/ce_loss": 1.128238558769226 }, { "epoch": 0.15434051809373145, "step": 1561, "train/sim_loss": 0.04296875 }, { "epoch": 0.15434051809373145, "step": 1561, "train/total_loss": 0.15579260885715485 }, { "entropy": 9.306392669677734, "epoch": 0.154439390943247, "mean_token_accuracy": 0.7141148447990417, "num_tokens": 8589043.0, "step": 1562, "train/ce_loss": 1.0327736139297485 }, { "epoch": 0.154439390943247, "step": 1562, "train/sim_loss": 0.11328125 }, { "epoch": 0.154439390943247, "step": 1562, "train/total_loss": 0.21655860543251038 }, { "entropy": 9.048067092895508, "epoch": 0.1545382637927625, "mean_token_accuracy": 0.7267759442329407, "num_tokens": 8594645.0, "step": 1563, "train/ce_loss": 1.2221134901046753 }, { "epoch": 0.1545382637927625, "step": 1563, "train/sim_loss": 0.0703125 }, { "epoch": 0.1545382637927625, "step": 1563, "train/total_loss": 0.19252385199069977 }, { "entropy": 9.311325073242188, "epoch": 0.15463713664227802, "mean_token_accuracy": 0.7747858166694641, "num_tokens": 8600071.0, "step": 1564, "train/ce_loss": 1.0946104526519775 }, { "epoch": 0.15463713664227802, "step": 1564, "train/sim_loss": 0.0546875 }, { "epoch": 0.15463713664227802, "step": 1564, "train/total_loss": 0.16414853930473328 }, { "entropy": 9.451818466186523, "epoch": 0.15473600949179356, "mean_token_accuracy": 0.7394179701805115, "num_tokens": 8605458.0, "step": 1565, "train/ce_loss": 0.7393949031829834 }, { "epoch": 0.15473600949179356, "step": 1565, "train/sim_loss": 0.08984375 }, { "epoch": 0.15473600949179356, "step": 1565, "train/total_loss": 0.1637832522392273 }, { "entropy": 9.075830459594727, "epoch": 0.15483488234130907, "mean_token_accuracy": 0.7014925479888916, "num_tokens": 8611068.0, "step": 1566, "train/ce_loss": 0.74875807762146 }, { "epoch": 0.15483488234130907, "step": 1566, "train/sim_loss": 0.09375 }, { "epoch": 0.15483488234130907, "step": 1566, "train/total_loss": 0.16862580180168152 }, { "entropy": 9.142054557800293, "epoch": 0.1549337551908246, "mean_token_accuracy": 0.8035902976989746, "num_tokens": 8616682.0, "step": 1567, "train/ce_loss": 0.38282567262649536 }, { "epoch": 0.1549337551908246, "step": 1567, "train/sim_loss": 0.05078125 }, { "epoch": 0.1549337551908246, "step": 1567, "train/total_loss": 0.08906382322311401 }, { "entropy": 9.247254371643066, "epoch": 0.15503262804034013, "mean_token_accuracy": 0.676962673664093, "num_tokens": 8622052.0, "step": 1568, "train/ce_loss": 1.7686634063720703 }, { "epoch": 0.15503262804034013, "step": 1568, "train/sim_loss": 0.1171875 }, { "epoch": 0.15503262804034013, "step": 1568, "train/total_loss": 0.294053852558136 }, { "entropy": 9.055328369140625, "epoch": 0.15513150088985564, "mean_token_accuracy": 0.7603938579559326, "num_tokens": 8627620.0, "step": 1569, "train/ce_loss": 0.5529923439025879 }, { "epoch": 0.15513150088985564, "step": 1569, "train/sim_loss": 0.09765625 }, { "epoch": 0.15513150088985564, "step": 1569, "train/total_loss": 0.15295548737049103 }, { "entropy": 9.296239852905273, "epoch": 0.15523037373937118, "mean_token_accuracy": 0.74405437707901, "num_tokens": 8633095.0, "step": 1570, "train/ce_loss": 0.4851263165473938 }, { "epoch": 0.15523037373937118, "step": 1570, "train/sim_loss": 0.0390625 }, { "epoch": 0.15523037373937118, "step": 1570, "train/total_loss": 0.08757513761520386 }, { "entropy": 9.094669342041016, "epoch": 0.1553292465888867, "mean_token_accuracy": 0.7255092263221741, "num_tokens": 8638634.0, "step": 1571, "train/ce_loss": 0.8567712306976318 }, { "epoch": 0.1553292465888867, "step": 1571, "train/sim_loss": 0.08984375 }, { "epoch": 0.1553292465888867, "step": 1571, "train/total_loss": 0.1755208671092987 }, { "entropy": 9.106914520263672, "epoch": 0.1554281194384022, "mean_token_accuracy": 0.7234273552894592, "num_tokens": 8644194.0, "step": 1572, "train/ce_loss": 0.5203515887260437 }, { "epoch": 0.1554281194384022, "step": 1572, "train/sim_loss": 0.046875 }, { "epoch": 0.1554281194384022, "step": 1572, "train/total_loss": 0.09891016036272049 }, { "entropy": 9.118014335632324, "epoch": 0.15552699228791775, "mean_token_accuracy": 0.8313384056091309, "num_tokens": 8649739.0, "step": 1573, "train/ce_loss": 0.3981727361679077 }, { "epoch": 0.15552699228791775, "step": 1573, "train/sim_loss": 0.03125 }, { "epoch": 0.15552699228791775, "step": 1573, "train/total_loss": 0.07106727361679077 }, { "entropy": 9.14439582824707, "epoch": 0.15562586513743326, "mean_token_accuracy": 0.7903403043746948, "num_tokens": 8655260.0, "step": 1574, "train/ce_loss": 0.35552677512168884 }, { "epoch": 0.15562586513743326, "step": 1574, "train/sim_loss": 0.0546875 }, { "epoch": 0.15562586513743326, "step": 1574, "train/total_loss": 0.09024018049240112 }, { "entropy": 8.878335952758789, "epoch": 0.15572473798694877, "mean_token_accuracy": 0.7449275255203247, "num_tokens": 8660894.0, "step": 1575, "train/ce_loss": 0.6743720173835754 }, { "epoch": 0.15572473798694877, "step": 1575, "train/sim_loss": 0.04296875 }, { "epoch": 0.15572473798694877, "step": 1575, "train/total_loss": 0.11040595173835754 }, { "entropy": 9.350927352905273, "epoch": 0.15582361083646432, "mean_token_accuracy": 0.7354430556297302, "num_tokens": 8666349.0, "step": 1576, "train/ce_loss": 0.5829963684082031 }, { "epoch": 0.15582361083646432, "step": 1576, "train/sim_loss": 0.03515625 }, { "epoch": 0.15582361083646432, "step": 1576, "train/total_loss": 0.09345588833093643 }, { "entropy": 9.23637580871582, "epoch": 0.15592248368597983, "mean_token_accuracy": 0.7494529485702515, "num_tokens": 8671891.0, "step": 1577, "train/ce_loss": 0.6643129587173462 }, { "epoch": 0.15592248368597983, "step": 1577, "train/sim_loss": 0.0703125 }, { "epoch": 0.15592248368597983, "step": 1577, "train/total_loss": 0.13674379885196686 }, { "entropy": 9.287670135498047, "epoch": 0.15602135653549534, "mean_token_accuracy": 0.6965866088867188, "num_tokens": 8677377.0, "step": 1578, "train/ce_loss": 0.9719767570495605 }, { "epoch": 0.15602135653549534, "step": 1578, "train/sim_loss": 0.109375 }, { "epoch": 0.15602135653549534, "step": 1578, "train/total_loss": 0.20657268166542053 }, { "entropy": 9.566617965698242, "epoch": 0.15612022938501088, "mean_token_accuracy": 0.7586206793785095, "num_tokens": 8682879.0, "step": 1579, "train/ce_loss": 0.4522360563278198 }, { "epoch": 0.15612022938501088, "step": 1579, "train/sim_loss": 0.0859375 }, { "epoch": 0.15612022938501088, "step": 1579, "train/total_loss": 0.13116110861301422 }, { "epoch": 0.1562191022345264, "grad_norm": 0.8316078782081604, "learning_rate": 9.612075359738912e-06, "loss": 0.1588, "step": 1580 }, { "entropy": 8.8673677444458, "epoch": 0.1562191022345264, "mean_token_accuracy": 0.764014482498169, "num_tokens": 8688614.0, "step": 1580, "train/ce_loss": 1.0173624753952026 }, { "epoch": 0.1562191022345264, "step": 1580, "train/sim_loss": 0.1015625 }, { "epoch": 0.1562191022345264, "step": 1580, "train/total_loss": 0.20329874753952026 }, { "entropy": 9.354850769042969, "epoch": 0.1563179750840419, "mean_token_accuracy": 0.7702871561050415, "num_tokens": 8694149.0, "step": 1581, "train/ce_loss": 1.1489191055297852 }, { "epoch": 0.1563179750840419, "step": 1581, "train/sim_loss": 0.078125 }, { "epoch": 0.1563179750840419, "step": 1581, "train/total_loss": 0.193016916513443 }, { "entropy": 8.776041030883789, "epoch": 0.15641684793355745, "mean_token_accuracy": 0.755926251411438, "num_tokens": 8699903.0, "step": 1582, "train/ce_loss": 1.5151480436325073 }, { "epoch": 0.15641684793355745, "step": 1582, "train/sim_loss": 0.0546875 }, { "epoch": 0.15641684793355745, "step": 1582, "train/total_loss": 0.20620231330394745 }, { "entropy": 9.418830871582031, "epoch": 0.15651572078307296, "mean_token_accuracy": 0.739072859287262, "num_tokens": 8705250.0, "step": 1583, "train/ce_loss": 0.6842243671417236 }, { "epoch": 0.15651572078307296, "step": 1583, "train/sim_loss": 0.05078125 }, { "epoch": 0.15651572078307296, "step": 1583, "train/total_loss": 0.11920368671417236 }, { "entropy": 9.247335433959961, "epoch": 0.15661459363258848, "mean_token_accuracy": 0.7536041736602783, "num_tokens": 8710678.0, "step": 1584, "train/ce_loss": 1.053579568862915 }, { "epoch": 0.15661459363258848, "step": 1584, "train/sim_loss": 0.10546875 }, { "epoch": 0.15661459363258848, "step": 1584, "train/total_loss": 0.21082670986652374 }, { "entropy": 9.164085388183594, "epoch": 0.15671346648210402, "mean_token_accuracy": 0.762806236743927, "num_tokens": 8716201.0, "step": 1585, "train/ce_loss": 0.5769314169883728 }, { "epoch": 0.15671346648210402, "step": 1585, "train/sim_loss": 0.12109375 }, { "epoch": 0.15671346648210402, "step": 1585, "train/total_loss": 0.17878688871860504 }, { "entropy": 9.297210693359375, "epoch": 0.15681233933161953, "mean_token_accuracy": 0.7476635575294495, "num_tokens": 8721558.0, "step": 1586, "train/ce_loss": 0.47411200404167175 }, { "epoch": 0.15681233933161953, "step": 1586, "train/sim_loss": 0.08984375 }, { "epoch": 0.15681233933161953, "step": 1586, "train/total_loss": 0.13725495338439941 }, { "entropy": 8.746498107910156, "epoch": 0.15691121218113507, "mean_token_accuracy": 0.7508928775787354, "num_tokens": 8727267.0, "step": 1587, "train/ce_loss": 0.907015323638916 }, { "epoch": 0.15691121218113507, "step": 1587, "train/sim_loss": 0.03125 }, { "epoch": 0.15691121218113507, "step": 1587, "train/total_loss": 0.12195153534412384 }, { "entropy": 9.323437690734863, "epoch": 0.1570100850306506, "mean_token_accuracy": 0.752173900604248, "num_tokens": 8732558.0, "step": 1588, "train/ce_loss": 0.5369629859924316 }, { "epoch": 0.1570100850306506, "step": 1588, "train/sim_loss": 0.07421875 }, { "epoch": 0.1570100850306506, "step": 1588, "train/total_loss": 0.12791505455970764 }, { "entropy": 9.525991439819336, "epoch": 0.1571089578801661, "mean_token_accuracy": 0.7577903866767883, "num_tokens": 8737851.0, "step": 1589, "train/ce_loss": 0.7084870338439941 }, { "epoch": 0.1571089578801661, "step": 1589, "train/sim_loss": 0.03515625 }, { "epoch": 0.1571089578801661, "step": 1589, "train/total_loss": 0.10600495338439941 }, { "entropy": 9.435827255249023, "epoch": 0.15720783072968164, "mean_token_accuracy": 0.738002598285675, "num_tokens": 8743285.0, "step": 1590, "train/ce_loss": 1.0443390607833862 }, { "epoch": 0.15720783072968164, "step": 1590, "train/sim_loss": 0.08203125 }, { "epoch": 0.15720783072968164, "step": 1590, "train/total_loss": 0.18646515905857086 }, { "entropy": 9.33414077758789, "epoch": 0.15730670357919715, "mean_token_accuracy": 0.7493765354156494, "num_tokens": 8748700.0, "step": 1591, "train/ce_loss": 0.5816449522972107 }, { "epoch": 0.15730670357919715, "step": 1591, "train/sim_loss": 0.0859375 }, { "epoch": 0.15730670357919715, "step": 1591, "train/total_loss": 0.14410199224948883 }, { "entropy": 9.554068565368652, "epoch": 0.15740557642871267, "mean_token_accuracy": 0.7255936861038208, "num_tokens": 8754042.0, "step": 1592, "train/ce_loss": 0.6091744899749756 }, { "epoch": 0.15740557642871267, "step": 1592, "train/sim_loss": 0.0546875 }, { "epoch": 0.15740557642871267, "step": 1592, "train/total_loss": 0.1156049519777298 }, { "entropy": 9.158269882202148, "epoch": 0.1575044492782282, "mean_token_accuracy": 0.8164627552032471, "num_tokens": 8759599.0, "step": 1593, "train/ce_loss": 0.6006219387054443 }, { "epoch": 0.1575044492782282, "step": 1593, "train/sim_loss": 0.109375 }, { "epoch": 0.1575044492782282, "step": 1593, "train/total_loss": 0.1694371998310089 }, { "entropy": 8.920272827148438, "epoch": 0.15760332212774372, "mean_token_accuracy": 0.7577497363090515, "num_tokens": 8765143.0, "step": 1594, "train/ce_loss": 0.8664013743400574 }, { "epoch": 0.15760332212774372, "step": 1594, "train/sim_loss": 0.0703125 }, { "epoch": 0.15760332212774372, "step": 1594, "train/total_loss": 0.1569526493549347 }, { "entropy": 9.282458305358887, "epoch": 0.15770219497725924, "mean_token_accuracy": 0.730140209197998, "num_tokens": 8770461.0, "step": 1595, "train/ce_loss": 1.054739236831665 }, { "epoch": 0.15770219497725924, "step": 1595, "train/sim_loss": 0.078125 }, { "epoch": 0.15770219497725924, "step": 1595, "train/total_loss": 0.18359893560409546 }, { "entropy": 9.226791381835938, "epoch": 0.15780106782677478, "mean_token_accuracy": 0.7839721441268921, "num_tokens": 8775931.0, "step": 1596, "train/ce_loss": 0.6525909900665283 }, { "epoch": 0.15780106782677478, "step": 1596, "train/sim_loss": 0.03125 }, { "epoch": 0.15780106782677478, "step": 1596, "train/total_loss": 0.09650909900665283 }, { "entropy": 9.162903785705566, "epoch": 0.1578999406762903, "mean_token_accuracy": 0.7251908183097839, "num_tokens": 8781466.0, "step": 1597, "train/ce_loss": 1.1582051515579224 }, { "epoch": 0.1578999406762903, "step": 1597, "train/sim_loss": 0.0859375 }, { "epoch": 0.1578999406762903, "step": 1597, "train/total_loss": 0.2017580270767212 }, { "entropy": 9.013277053833008, "epoch": 0.1579988135258058, "mean_token_accuracy": 0.7220956683158875, "num_tokens": 8787002.0, "step": 1598, "train/ce_loss": 1.3079602718353271 }, { "epoch": 0.1579988135258058, "step": 1598, "train/sim_loss": 0.08203125 }, { "epoch": 0.1579988135258058, "step": 1598, "train/total_loss": 0.21282728016376495 }, { "entropy": 9.411697387695312, "epoch": 0.15809768637532134, "mean_token_accuracy": 0.6966292262077332, "num_tokens": 8792328.0, "step": 1599, "train/ce_loss": 0.58868408203125 }, { "epoch": 0.15809768637532134, "step": 1599, "train/sim_loss": 0.08984375 }, { "epoch": 0.15809768637532134, "step": 1599, "train/total_loss": 0.148712158203125 }, { "epoch": 0.15819655922483686, "grad_norm": 0.9583412408828735, "learning_rate": 9.607130494980962e-06, "loss": 0.1541, "step": 1600 }, { "entropy": 8.820891380310059, "epoch": 0.15819655922483686, "mean_token_accuracy": 0.7144221663475037, "num_tokens": 8798060.0, "step": 1600, "train/ce_loss": 0.707119882106781 }, { "epoch": 0.15819655922483686, "step": 1600, "train/sim_loss": 0.09375 }, { "epoch": 0.15819655922483686, "step": 1600, "train/total_loss": 0.16446200013160706 }, { "entropy": 9.372772216796875, "epoch": 0.15829543207435237, "mean_token_accuracy": 0.7521578073501587, "num_tokens": 8803404.0, "step": 1601, "train/ce_loss": 0.7145566344261169 }, { "epoch": 0.15829543207435237, "step": 1601, "train/sim_loss": 0.07421875 }, { "epoch": 0.15829543207435237, "step": 1601, "train/total_loss": 0.14567440748214722 }, { "entropy": 9.317365646362305, "epoch": 0.1583943049238679, "mean_token_accuracy": 0.6812227368354797, "num_tokens": 8808862.0, "step": 1602, "train/ce_loss": 1.9266247749328613 }, { "epoch": 0.1583943049238679, "step": 1602, "train/sim_loss": 0.07421875 }, { "epoch": 0.1583943049238679, "step": 1602, "train/total_loss": 0.26688122749328613 }, { "entropy": 9.341109275817871, "epoch": 0.15849317777338343, "mean_token_accuracy": 0.7653562426567078, "num_tokens": 8814302.0, "step": 1603, "train/ce_loss": 0.6028212308883667 }, { "epoch": 0.15849317777338343, "step": 1603, "train/sim_loss": 0.02734375 }, { "epoch": 0.15849317777338343, "step": 1603, "train/total_loss": 0.08762587606906891 }, { "entropy": 9.369998931884766, "epoch": 0.15859205062289894, "mean_token_accuracy": 0.7407407164573669, "num_tokens": 8819707.0, "step": 1604, "train/ce_loss": 0.9333735108375549 }, { "epoch": 0.15859205062289894, "step": 1604, "train/sim_loss": 0.09375 }, { "epoch": 0.15859205062289894, "step": 1604, "train/total_loss": 0.18708735704421997 }, { "entropy": 8.92708969116211, "epoch": 0.15869092347241448, "mean_token_accuracy": 0.800000011920929, "num_tokens": 8825345.0, "step": 1605, "train/ce_loss": 1.1062434911727905 }, { "epoch": 0.15869092347241448, "step": 1605, "train/sim_loss": 0.1015625 }, { "epoch": 0.15869092347241448, "step": 1605, "train/total_loss": 0.21218684315681458 }, { "entropy": 9.383434295654297, "epoch": 0.15878979632193, "mean_token_accuracy": 0.7562582492828369, "num_tokens": 8830760.0, "step": 1606, "train/ce_loss": 0.5480013489723206 }, { "epoch": 0.15878979632193, "step": 1606, "train/sim_loss": 0.1015625 }, { "epoch": 0.15878979632193, "step": 1606, "train/total_loss": 0.1563626378774643 }, { "entropy": 9.06509017944336, "epoch": 0.15888866917144553, "mean_token_accuracy": 0.7138554453849792, "num_tokens": 8836334.0, "step": 1607, "train/ce_loss": 0.6371885538101196 }, { "epoch": 0.15888866917144553, "step": 1607, "train/sim_loss": 0.06640625 }, { "epoch": 0.15888866917144553, "step": 1607, "train/total_loss": 0.13012510538101196 }, { "entropy": 9.108753204345703, "epoch": 0.15898754202096105, "mean_token_accuracy": 0.730434775352478, "num_tokens": 8841911.0, "step": 1608, "train/ce_loss": 0.5235300660133362 }, { "epoch": 0.15898754202096105, "step": 1608, "train/sim_loss": 0.03125 }, { "epoch": 0.15898754202096105, "step": 1608, "train/total_loss": 0.08360300958156586 }, { "entropy": 9.148024559020996, "epoch": 0.15908641487047656, "mean_token_accuracy": 0.7829977869987488, "num_tokens": 8847465.0, "step": 1609, "train/ce_loss": 0.7307268381118774 }, { "epoch": 0.15908641487047656, "step": 1609, "train/sim_loss": 0.0703125 }, { "epoch": 0.15908641487047656, "step": 1609, "train/total_loss": 0.14338518679141998 }, { "entropy": 9.340641975402832, "epoch": 0.1591852877199921, "mean_token_accuracy": 0.7205039858818054, "num_tokens": 8852965.0, "step": 1610, "train/ce_loss": 0.7988046407699585 }, { "epoch": 0.1591852877199921, "step": 1610, "train/sim_loss": 0.05859375 }, { "epoch": 0.1591852877199921, "step": 1610, "train/total_loss": 0.1384742259979248 }, { "entropy": 9.513772010803223, "epoch": 0.15928416056950762, "mean_token_accuracy": 0.6942148804664612, "num_tokens": 8858411.0, "step": 1611, "train/ce_loss": 1.7142342329025269 }, { "epoch": 0.15928416056950762, "step": 1611, "train/sim_loss": 0.0703125 }, { "epoch": 0.15928416056950762, "step": 1611, "train/total_loss": 0.24173592031002045 }, { "entropy": 9.542659759521484, "epoch": 0.15938303341902313, "mean_token_accuracy": 0.7421289086341858, "num_tokens": 8863709.0, "step": 1612, "train/ce_loss": 0.5493373870849609 }, { "epoch": 0.15938303341902313, "step": 1612, "train/sim_loss": 0.08203125 }, { "epoch": 0.15938303341902313, "step": 1612, "train/total_loss": 0.13696499168872833 }, { "entropy": 9.507055282592773, "epoch": 0.15948190626853867, "mean_token_accuracy": 0.7119078040122986, "num_tokens": 8869105.0, "step": 1613, "train/ce_loss": 0.6574373841285706 }, { "epoch": 0.15948190626853867, "step": 1613, "train/sim_loss": 0.0546875 }, { "epoch": 0.15948190626853867, "step": 1613, "train/total_loss": 0.12043123692274094 }, { "entropy": 9.449485778808594, "epoch": 0.15958077911805418, "mean_token_accuracy": 0.7059639096260071, "num_tokens": 8874383.0, "step": 1614, "train/ce_loss": 0.754055917263031 }, { "epoch": 0.15958077911805418, "step": 1614, "train/sim_loss": 0.0546875 }, { "epoch": 0.15958077911805418, "step": 1614, "train/total_loss": 0.13009309768676758 }, { "entropy": 9.122669219970703, "epoch": 0.1596796519675697, "mean_token_accuracy": 0.7378190159797668, "num_tokens": 8879878.0, "step": 1615, "train/ce_loss": 0.9600697755813599 }, { "epoch": 0.1596796519675697, "step": 1615, "train/sim_loss": 0.05078125 }, { "epoch": 0.1596796519675697, "step": 1615, "train/total_loss": 0.14678823947906494 }, { "entropy": 9.157553672790527, "epoch": 0.15977852481708524, "mean_token_accuracy": 0.7753623127937317, "num_tokens": 8885494.0, "step": 1616, "train/ce_loss": 0.8829214572906494 }, { "epoch": 0.15977852481708524, "step": 1616, "train/sim_loss": 0.16796875 }, { "epoch": 0.15977852481708524, "step": 1616, "train/total_loss": 0.2562609016895294 }, { "entropy": 9.63836669921875, "epoch": 0.15987739766660075, "mean_token_accuracy": 0.7281690239906311, "num_tokens": 8890780.0, "step": 1617, "train/ce_loss": 0.6130746006965637 }, { "epoch": 0.15987739766660075, "step": 1617, "train/sim_loss": 0.05859375 }, { "epoch": 0.15987739766660075, "step": 1617, "train/total_loss": 0.11990121006965637 }, { "entropy": 9.341784477233887, "epoch": 0.15997627051611626, "mean_token_accuracy": 0.75, "num_tokens": 8896235.0, "step": 1618, "train/ce_loss": 1.2587661743164062 }, { "epoch": 0.15997627051611626, "step": 1618, "train/sim_loss": 0.07421875 }, { "epoch": 0.15997627051611626, "step": 1618, "train/total_loss": 0.20009537041187286 }, { "entropy": 9.297994613647461, "epoch": 0.1600751433656318, "mean_token_accuracy": 0.8210757374763489, "num_tokens": 8901647.0, "step": 1619, "train/ce_loss": 0.5513429641723633 }, { "epoch": 0.1600751433656318, "step": 1619, "train/sim_loss": 0.02734375 }, { "epoch": 0.1600751433656318, "step": 1619, "train/total_loss": 0.08247804641723633 }, { "epoch": 0.16017401621514732, "grad_norm": 0.7175630927085876, "learning_rate": 9.602185630223014e-06, "loss": 0.1552, "step": 1620 }, { "entropy": 9.165353775024414, "epoch": 0.16017401621514732, "mean_token_accuracy": 0.7549019455909729, "num_tokens": 8907059.0, "step": 1620, "train/ce_loss": 0.7620391845703125 }, { "epoch": 0.16017401621514732, "step": 1620, "train/sim_loss": 0.04296875 }, { "epoch": 0.16017401621514732, "step": 1620, "train/total_loss": 0.11917266994714737 }, { "entropy": 8.933706283569336, "epoch": 0.16027288906466283, "mean_token_accuracy": 0.7843326926231384, "num_tokens": 8912719.0, "step": 1621, "train/ce_loss": 0.46412408351898193 }, { "epoch": 0.16027288906466283, "step": 1621, "train/sim_loss": 0.0390625 }, { "epoch": 0.16027288906466283, "step": 1621, "train/total_loss": 0.0854749083518982 }, { "entropy": 9.225966453552246, "epoch": 0.16037176191417837, "mean_token_accuracy": 0.7669441103935242, "num_tokens": 8918162.0, "step": 1622, "train/ce_loss": 0.7520866394042969 }, { "epoch": 0.16037176191417837, "step": 1622, "train/sim_loss": 0.12890625 }, { "epoch": 0.16037176191417837, "step": 1622, "train/total_loss": 0.2041149139404297 }, { "entropy": 8.868877410888672, "epoch": 0.1604706347636939, "mean_token_accuracy": 0.7562141418457031, "num_tokens": 8923901.0, "step": 1623, "train/ce_loss": 0.7151184678077698 }, { "epoch": 0.1604706347636939, "step": 1623, "train/sim_loss": 0.0546875 }, { "epoch": 0.1604706347636939, "step": 1623, "train/total_loss": 0.12619934976100922 }, { "entropy": 9.106497764587402, "epoch": 0.1605695076132094, "mean_token_accuracy": 0.771894097328186, "num_tokens": 8929444.0, "step": 1624, "train/ce_loss": 0.3290867209434509 }, { "epoch": 0.1605695076132094, "step": 1624, "train/sim_loss": 0.03515625 }, { "epoch": 0.1605695076132094, "step": 1624, "train/total_loss": 0.06806492805480957 }, { "entropy": 9.48556137084961, "epoch": 0.16066838046272494, "mean_token_accuracy": 0.754054069519043, "num_tokens": 8934735.0, "step": 1625, "train/ce_loss": 1.2316285371780396 }, { "epoch": 0.16066838046272494, "step": 1625, "train/sim_loss": 0.0859375 }, { "epoch": 0.16066838046272494, "step": 1625, "train/total_loss": 0.2091003656387329 }, { "entropy": 9.367485046386719, "epoch": 0.16076725331224045, "mean_token_accuracy": 0.6870229244232178, "num_tokens": 8940066.0, "step": 1626, "train/ce_loss": 1.233827829360962 }, { "epoch": 0.16076725331224045, "step": 1626, "train/sim_loss": 0.12890625 }, { "epoch": 0.16076725331224045, "step": 1626, "train/total_loss": 0.2522890269756317 }, { "entropy": 8.941427230834961, "epoch": 0.160866126161756, "mean_token_accuracy": 0.6874381899833679, "num_tokens": 8945668.0, "step": 1627, "train/ce_loss": 0.6154370903968811 }, { "epoch": 0.160866126161756, "step": 1627, "train/sim_loss": 0.07421875 }, { "epoch": 0.160866126161756, "step": 1627, "train/total_loss": 0.13576245307922363 }, { "entropy": 8.903136253356934, "epoch": 0.1609649990112715, "mean_token_accuracy": 0.7626076340675354, "num_tokens": 8951149.0, "step": 1628, "train/ce_loss": 0.607441782951355 }, { "epoch": 0.1609649990112715, "step": 1628, "train/sim_loss": 0.0859375 }, { "epoch": 0.1609649990112715, "step": 1628, "train/total_loss": 0.14668168127536774 }, { "entropy": 9.469399452209473, "epoch": 0.16106387186078702, "mean_token_accuracy": 0.727385401725769, "num_tokens": 8956452.0, "step": 1629, "train/ce_loss": 0.9414197206497192 }, { "epoch": 0.16106387186078702, "step": 1629, "train/sim_loss": 0.0859375 }, { "epoch": 0.16106387186078702, "step": 1629, "train/total_loss": 0.18007947504520416 }, { "entropy": 9.178739547729492, "epoch": 0.16116274471030256, "mean_token_accuracy": 0.6777408719062805, "num_tokens": 8961913.0, "step": 1630, "train/ce_loss": 1.320082187652588 }, { "epoch": 0.16116274471030256, "step": 1630, "train/sim_loss": 0.15625 }, { "epoch": 0.16116274471030256, "step": 1630, "train/total_loss": 0.28825822472572327 }, { "entropy": 8.92513656616211, "epoch": 0.16126161755981808, "mean_token_accuracy": 0.7434077262878418, "num_tokens": 8967588.0, "step": 1631, "train/ce_loss": 1.0132871866226196 }, { "epoch": 0.16126161755981808, "step": 1631, "train/sim_loss": 0.1171875 }, { "epoch": 0.16126161755981808, "step": 1631, "train/total_loss": 0.21851623058319092 }, { "entropy": 9.651358604431152, "epoch": 0.1613604904093336, "mean_token_accuracy": 0.7356495261192322, "num_tokens": 8972834.0, "step": 1632, "train/ce_loss": 0.4780392050743103 }, { "epoch": 0.1613604904093336, "step": 1632, "train/sim_loss": 0.08984375 }, { "epoch": 0.1613604904093336, "step": 1632, "train/total_loss": 0.13764767348766327 }, { "entropy": 9.578935623168945, "epoch": 0.16145936325884913, "mean_token_accuracy": 0.7496522665023804, "num_tokens": 8978158.0, "step": 1633, "train/ce_loss": 0.7718422412872314 }, { "epoch": 0.16145936325884913, "step": 1633, "train/sim_loss": 0.0859375 }, { "epoch": 0.16145936325884913, "step": 1633, "train/total_loss": 0.16312173008918762 }, { "entropy": 8.710001945495605, "epoch": 0.16155823610836464, "mean_token_accuracy": 0.7485119104385376, "num_tokens": 8984084.0, "step": 1634, "train/ce_loss": 0.21662509441375732 }, { "epoch": 0.16155823610836464, "step": 1634, "train/sim_loss": 0.03125 }, { "epoch": 0.16155823610836464, "step": 1634, "train/total_loss": 0.05291251093149185 }, { "entropy": 9.168803215026855, "epoch": 0.16165710895788016, "mean_token_accuracy": 0.752173900604248, "num_tokens": 8989657.0, "step": 1635, "train/ce_loss": 0.7669188380241394 }, { "epoch": 0.16165710895788016, "step": 1635, "train/sim_loss": 0.0625 }, { "epoch": 0.16165710895788016, "step": 1635, "train/total_loss": 0.1391918957233429 }, { "entropy": 8.915168762207031, "epoch": 0.1617559818073957, "mean_token_accuracy": 0.7997936010360718, "num_tokens": 8995285.0, "step": 1636, "train/ce_loss": 0.3957074284553528 }, { "epoch": 0.1617559818073957, "step": 1636, "train/sim_loss": 0.06640625 }, { "epoch": 0.1617559818073957, "step": 1636, "train/total_loss": 0.10597699880599976 }, { "entropy": 9.45190715789795, "epoch": 0.1618548546569112, "mean_token_accuracy": 0.8028169274330139, "num_tokens": 9000570.0, "step": 1637, "train/ce_loss": 0.5181426405906677 }, { "epoch": 0.1618548546569112, "step": 1637, "train/sim_loss": 0.046875 }, { "epoch": 0.1618548546569112, "step": 1637, "train/total_loss": 0.09868926554918289 }, { "entropy": 9.203298568725586, "epoch": 0.16195372750642673, "mean_token_accuracy": 0.7578215599060059, "num_tokens": 9006032.0, "step": 1638, "train/ce_loss": 0.831872820854187 }, { "epoch": 0.16195372750642673, "step": 1638, "train/sim_loss": 0.03125 }, { "epoch": 0.16195372750642673, "step": 1638, "train/total_loss": 0.1144372820854187 }, { "entropy": 9.689971923828125, "epoch": 0.16205260035594227, "mean_token_accuracy": 0.7806041240692139, "num_tokens": 9011263.0, "step": 1639, "train/ce_loss": 0.5091314911842346 }, { "epoch": 0.16205260035594227, "step": 1639, "train/sim_loss": 0.046875 }, { "epoch": 0.16205260035594227, "step": 1639, "train/total_loss": 0.09778815507888794 }, { "epoch": 0.16215147320545778, "grad_norm": 1.0788648128509521, "learning_rate": 9.597240765465065e-06, "loss": 0.1567, "step": 1640 }, { "entropy": 9.987569808959961, "epoch": 0.16215147320545778, "mean_token_accuracy": 0.7476635575294495, "num_tokens": 9016389.0, "step": 1640, "train/ce_loss": 0.575297474861145 }, { "epoch": 0.16215147320545778, "step": 1640, "train/sim_loss": 0.04296875 }, { "epoch": 0.16215147320545778, "step": 1640, "train/total_loss": 0.1004984974861145 }, { "entropy": 9.078866958618164, "epoch": 0.1622503460549733, "mean_token_accuracy": 0.758583664894104, "num_tokens": 9021901.0, "step": 1641, "train/ce_loss": 0.4968828558921814 }, { "epoch": 0.1622503460549733, "step": 1641, "train/sim_loss": 0.0234375 }, { "epoch": 0.1622503460549733, "step": 1641, "train/total_loss": 0.07312578707933426 }, { "entropy": 9.197505950927734, "epoch": 0.16234921890448883, "mean_token_accuracy": 0.7369614243507385, "num_tokens": 9027368.0, "step": 1642, "train/ce_loss": 1.0219746828079224 }, { "epoch": 0.16234921890448883, "step": 1642, "train/sim_loss": 0.0859375 }, { "epoch": 0.16234921890448883, "step": 1642, "train/total_loss": 0.18813496828079224 }, { "entropy": 9.073518753051758, "epoch": 0.16244809175400435, "mean_token_accuracy": 0.7030237317085266, "num_tokens": 9032949.0, "step": 1643, "train/ce_loss": 0.8183103203773499 }, { "epoch": 0.16244809175400435, "step": 1643, "train/sim_loss": 0.11328125 }, { "epoch": 0.16244809175400435, "step": 1643, "train/total_loss": 0.19511228799819946 }, { "entropy": 9.595041275024414, "epoch": 0.16254696460351986, "mean_token_accuracy": 0.7375178337097168, "num_tokens": 9038297.0, "step": 1644, "train/ce_loss": 0.5220864415168762 }, { "epoch": 0.16254696460351986, "step": 1644, "train/sim_loss": 0.08203125 }, { "epoch": 0.16254696460351986, "step": 1644, "train/total_loss": 0.13423989713191986 }, { "entropy": 9.401962280273438, "epoch": 0.1626458374530354, "mean_token_accuracy": 0.739130437374115, "num_tokens": 9043659.0, "step": 1645, "train/ce_loss": 0.5610272884368896 }, { "epoch": 0.1626458374530354, "step": 1645, "train/sim_loss": 0.08203125 }, { "epoch": 0.1626458374530354, "step": 1645, "train/total_loss": 0.1381339728832245 }, { "entropy": 9.420936584472656, "epoch": 0.16274471030255092, "mean_token_accuracy": 0.7213333249092102, "num_tokens": 9048964.0, "step": 1646, "train/ce_loss": 0.7972267270088196 }, { "epoch": 0.16274471030255092, "step": 1646, "train/sim_loss": 0.07421875 }, { "epoch": 0.16274471030255092, "step": 1646, "train/total_loss": 0.15394142270088196 }, { "entropy": 9.12226390838623, "epoch": 0.16284358315206643, "mean_token_accuracy": 0.6851063966751099, "num_tokens": 9054568.0, "step": 1647, "train/ce_loss": 0.9310101270675659 }, { "epoch": 0.16284358315206643, "step": 1647, "train/sim_loss": 0.09765625 }, { "epoch": 0.16284358315206643, "step": 1647, "train/total_loss": 0.19075727462768555 }, { "entropy": 9.083439826965332, "epoch": 0.16294245600158197, "mean_token_accuracy": 0.7209756374359131, "num_tokens": 9060159.0, "step": 1648, "train/ce_loss": 0.8328157663345337 }, { "epoch": 0.16294245600158197, "step": 1648, "train/sim_loss": 0.08203125 }, { "epoch": 0.16294245600158197, "step": 1648, "train/total_loss": 0.16531282663345337 }, { "entropy": 9.222625732421875, "epoch": 0.16304132885109748, "mean_token_accuracy": 0.7111383080482483, "num_tokens": 9065594.0, "step": 1649, "train/ce_loss": 1.526912808418274 }, { "epoch": 0.16304132885109748, "step": 1649, "train/sim_loss": 0.109375 }, { "epoch": 0.16304132885109748, "step": 1649, "train/total_loss": 0.2620663046836853 }, { "entropy": 9.398737907409668, "epoch": 0.16314020170061302, "mean_token_accuracy": 0.7147929072380066, "num_tokens": 9071002.0, "step": 1650, "train/ce_loss": 0.9241016507148743 }, { "epoch": 0.16314020170061302, "step": 1650, "train/sim_loss": 0.046875 }, { "epoch": 0.16314020170061302, "step": 1650, "train/total_loss": 0.13928517699241638 }, { "entropy": 9.059035301208496, "epoch": 0.16323907455012854, "mean_token_accuracy": 0.7489539980888367, "num_tokens": 9076540.0, "step": 1651, "train/ce_loss": 0.8418839573860168 }, { "epoch": 0.16323907455012854, "step": 1651, "train/sim_loss": 0.0703125 }, { "epoch": 0.16323907455012854, "step": 1651, "train/total_loss": 0.15450090169906616 }, { "entropy": 9.27216625213623, "epoch": 0.16333794739964405, "mean_token_accuracy": 0.7098214030265808, "num_tokens": 9082081.0, "step": 1652, "train/ce_loss": 0.639593243598938 }, { "epoch": 0.16333794739964405, "step": 1652, "train/sim_loss": 0.046875 }, { "epoch": 0.16333794739964405, "step": 1652, "train/total_loss": 0.11083432286977768 }, { "entropy": 9.3806791305542, "epoch": 0.1634368202491596, "mean_token_accuracy": 0.7147971391677856, "num_tokens": 9087499.0, "step": 1653, "train/ce_loss": 1.063983678817749 }, { "epoch": 0.1634368202491596, "step": 1653, "train/sim_loss": 0.1328125 }, { "epoch": 0.1634368202491596, "step": 1653, "train/total_loss": 0.23921087384223938 }, { "entropy": 9.237051010131836, "epoch": 0.1635356930986751, "mean_token_accuracy": 0.7038896083831787, "num_tokens": 9092967.0, "step": 1654, "train/ce_loss": 0.9912940859794617 }, { "epoch": 0.1635356930986751, "step": 1654, "train/sim_loss": 0.08203125 }, { "epoch": 0.1635356930986751, "step": 1654, "train/total_loss": 0.18116065859794617 }, { "entropy": 8.786413192749023, "epoch": 0.16363456594819062, "mean_token_accuracy": 0.75, "num_tokens": 9098777.0, "step": 1655, "train/ce_loss": 0.8534654974937439 }, { "epoch": 0.16363456594819062, "step": 1655, "train/sim_loss": 0.0390625 }, { "epoch": 0.16363456594819062, "step": 1655, "train/total_loss": 0.12440904974937439 }, { "entropy": 9.174128532409668, "epoch": 0.16373343879770616, "mean_token_accuracy": 0.7278911471366882, "num_tokens": 9104327.0, "step": 1656, "train/ce_loss": 0.8571991920471191 }, { "epoch": 0.16373343879770616, "step": 1656, "train/sim_loss": 0.0859375 }, { "epoch": 0.16373343879770616, "step": 1656, "train/total_loss": 0.17165741324424744 }, { "entropy": 9.20683479309082, "epoch": 0.16383231164722167, "mean_token_accuracy": 0.7097118496894836, "num_tokens": 9109808.0, "step": 1657, "train/ce_loss": 0.8352406024932861 }, { "epoch": 0.16383231164722167, "step": 1657, "train/sim_loss": 0.07421875 }, { "epoch": 0.16383231164722167, "step": 1657, "train/total_loss": 0.15774281322956085 }, { "entropy": 9.328764915466309, "epoch": 0.1639311844967372, "mean_token_accuracy": 0.6801007390022278, "num_tokens": 9115286.0, "step": 1658, "train/ce_loss": 0.9340882301330566 }, { "epoch": 0.1639311844967372, "step": 1658, "train/sim_loss": 0.12109375 }, { "epoch": 0.1639311844967372, "step": 1658, "train/total_loss": 0.21450257301330566 }, { "entropy": 9.485065460205078, "epoch": 0.16403005734625273, "mean_token_accuracy": 0.7352941036224365, "num_tokens": 9120703.0, "step": 1659, "train/ce_loss": 0.7388997673988342 }, { "epoch": 0.16403005734625273, "step": 1659, "train/sim_loss": 0.06640625 }, { "epoch": 0.16403005734625273, "step": 1659, "train/total_loss": 0.14029622077941895 }, { "epoch": 0.16412893019576824, "grad_norm": 0.9080350399017334, "learning_rate": 9.592295900707115e-06, "loss": 0.1704, "step": 1660 }, { "entropy": 9.391029357910156, "epoch": 0.16412893019576824, "mean_token_accuracy": 0.6836734414100647, "num_tokens": 9126115.0, "step": 1660, "train/ce_loss": 1.4721221923828125 }, { "epoch": 0.16412893019576824, "step": 1660, "train/sim_loss": 0.0625 }, { "epoch": 0.16412893019576824, "step": 1660, "train/total_loss": 0.2097122222185135 }, { "entropy": 8.955321311950684, "epoch": 0.16422780304528375, "mean_token_accuracy": 0.7642192244529724, "num_tokens": 9131820.0, "step": 1661, "train/ce_loss": 1.1226505041122437 }, { "epoch": 0.16422780304528375, "step": 1661, "train/sim_loss": 0.1015625 }, { "epoch": 0.16422780304528375, "step": 1661, "train/total_loss": 0.21382755041122437 }, { "entropy": 9.257125854492188, "epoch": 0.1643266758947993, "mean_token_accuracy": 0.801771879196167, "num_tokens": 9137381.0, "step": 1662, "train/ce_loss": 0.6096162796020508 }, { "epoch": 0.1643266758947993, "step": 1662, "train/sim_loss": 0.0625 }, { "epoch": 0.1643266758947993, "step": 1662, "train/total_loss": 0.12346163392066956 }, { "entropy": 9.273628234863281, "epoch": 0.1644255487443148, "mean_token_accuracy": 0.7740046977996826, "num_tokens": 9142813.0, "step": 1663, "train/ce_loss": 0.5592849254608154 }, { "epoch": 0.1644255487443148, "step": 1663, "train/sim_loss": 0.03515625 }, { "epoch": 0.1644255487443148, "step": 1663, "train/total_loss": 0.09108474850654602 }, { "entropy": 9.414695739746094, "epoch": 0.16452442159383032, "mean_token_accuracy": 0.7403726577758789, "num_tokens": 9148253.0, "step": 1664, "train/ce_loss": 0.42520394921302795 }, { "epoch": 0.16452442159383032, "step": 1664, "train/sim_loss": 0.0234375 }, { "epoch": 0.16452442159383032, "step": 1664, "train/total_loss": 0.06595789641141891 }, { "entropy": 9.508737564086914, "epoch": 0.16462329444334586, "mean_token_accuracy": 0.7590798735618591, "num_tokens": 9153628.0, "step": 1665, "train/ce_loss": 0.2650644779205322 }, { "epoch": 0.16462329444334586, "step": 1665, "train/sim_loss": 0.046875 }, { "epoch": 0.16462329444334586, "step": 1665, "train/total_loss": 0.0733814463019371 }, { "entropy": 9.290407180786133, "epoch": 0.16472216729286138, "mean_token_accuracy": 0.6843345165252686, "num_tokens": 9159098.0, "step": 1666, "train/ce_loss": 1.5189414024353027 }, { "epoch": 0.16472216729286138, "step": 1666, "train/sim_loss": 0.07421875 }, { "epoch": 0.16472216729286138, "step": 1666, "train/total_loss": 0.22611288726329803 }, { "entropy": 8.767293930053711, "epoch": 0.1648210401423769, "mean_token_accuracy": 0.7213973999023438, "num_tokens": 9164751.0, "step": 1667, "train/ce_loss": 1.2588108777999878 }, { "epoch": 0.1648210401423769, "step": 1667, "train/sim_loss": 0.0625 }, { "epoch": 0.1648210401423769, "step": 1667, "train/total_loss": 0.18838109076023102 }, { "entropy": 9.511958122253418, "epoch": 0.16491991299189243, "mean_token_accuracy": 0.7200000286102295, "num_tokens": 9170088.0, "step": 1668, "train/ce_loss": 0.6767095327377319 }, { "epoch": 0.16491991299189243, "step": 1668, "train/sim_loss": 0.0625 }, { "epoch": 0.16491991299189243, "step": 1668, "train/total_loss": 0.13017095625400543 }, { "entropy": 9.618408203125, "epoch": 0.16501878584140794, "mean_token_accuracy": 0.7521489858627319, "num_tokens": 9175344.0, "step": 1669, "train/ce_loss": 0.534354031085968 }, { "epoch": 0.16501878584140794, "step": 1669, "train/sim_loss": 0.09765625 }, { "epoch": 0.16501878584140794, "step": 1669, "train/total_loss": 0.15109165012836456 }, { "entropy": 9.493858337402344, "epoch": 0.16511765869092349, "mean_token_accuracy": 0.729411780834198, "num_tokens": 9180695.0, "step": 1670, "train/ce_loss": 0.8100049495697021 }, { "epoch": 0.16511765869092349, "step": 1670, "train/sim_loss": 0.0625 }, { "epoch": 0.16511765869092349, "step": 1670, "train/total_loss": 0.14350050687789917 }, { "entropy": 8.893009185791016, "epoch": 0.165216531540439, "mean_token_accuracy": 0.7160983085632324, "num_tokens": 9186555.0, "step": 1671, "train/ce_loss": 1.6258196830749512 }, { "epoch": 0.165216531540439, "step": 1671, "train/sim_loss": 0.078125 }, { "epoch": 0.165216531540439, "step": 1671, "train/total_loss": 0.24070696532726288 }, { "entropy": 9.158964157104492, "epoch": 0.1653154043899545, "mean_token_accuracy": 0.7922999262809753, "num_tokens": 9192304.0, "step": 1672, "train/ce_loss": 0.4680096507072449 }, { "epoch": 0.1653154043899545, "step": 1672, "train/sim_loss": 0.05859375 }, { "epoch": 0.1653154043899545, "step": 1672, "train/total_loss": 0.10539472103118896 }, { "entropy": 9.1004638671875, "epoch": 0.16541427723947005, "mean_token_accuracy": 0.7413793206214905, "num_tokens": 9197815.0, "step": 1673, "train/ce_loss": 1.115813970565796 }, { "epoch": 0.16541427723947005, "step": 1673, "train/sim_loss": 0.0703125 }, { "epoch": 0.16541427723947005, "step": 1673, "train/total_loss": 0.18189390003681183 }, { "entropy": 9.386797904968262, "epoch": 0.16551315008898557, "mean_token_accuracy": 0.7235890030860901, "num_tokens": 9203176.0, "step": 1674, "train/ce_loss": 1.5325413942337036 }, { "epoch": 0.16551315008898557, "step": 1674, "train/sim_loss": 0.109375 }, { "epoch": 0.16551315008898557, "step": 1674, "train/total_loss": 0.2626291513442993 }, { "entropy": 8.975360870361328, "epoch": 0.16561202293850108, "mean_token_accuracy": 0.7804123759269714, "num_tokens": 9208804.0, "step": 1675, "train/ce_loss": 0.7389521598815918 }, { "epoch": 0.16561202293850108, "step": 1675, "train/sim_loss": 0.03125 }, { "epoch": 0.16561202293850108, "step": 1675, "train/total_loss": 0.10514521598815918 }, { "entropy": 9.45688247680664, "epoch": 0.16571089578801662, "mean_token_accuracy": 0.7439490556716919, "num_tokens": 9214195.0, "step": 1676, "train/ce_loss": 0.8241857290267944 }, { "epoch": 0.16571089578801662, "step": 1676, "train/sim_loss": 0.109375 }, { "epoch": 0.16571089578801662, "step": 1676, "train/total_loss": 0.19179357588291168 }, { "entropy": 9.549358367919922, "epoch": 0.16580976863753213, "mean_token_accuracy": 0.7390691041946411, "num_tokens": 9219527.0, "step": 1677, "train/ce_loss": 0.6159114837646484 }, { "epoch": 0.16580976863753213, "step": 1677, "train/sim_loss": 0.02734375 }, { "epoch": 0.16580976863753213, "step": 1677, "train/total_loss": 0.08893489837646484 }, { "entropy": 9.262736320495605, "epoch": 0.16590864148704765, "mean_token_accuracy": 0.724672257900238, "num_tokens": 9224992.0, "step": 1678, "train/ce_loss": 0.5811233520507812 }, { "epoch": 0.16590864148704765, "step": 1678, "train/sim_loss": 0.1171875 }, { "epoch": 0.16590864148704765, "step": 1678, "train/total_loss": 0.17529983818531036 }, { "entropy": 8.879371643066406, "epoch": 0.1660075143365632, "mean_token_accuracy": 0.716481626033783, "num_tokens": 9230958.0, "step": 1679, "train/ce_loss": 1.6890853643417358 }, { "epoch": 0.1660075143365632, "step": 1679, "train/sim_loss": 0.078125 }, { "epoch": 0.1660075143365632, "step": 1679, "train/total_loss": 0.24703353643417358 }, { "epoch": 0.1661063871860787, "grad_norm": 0.9907097220420837, "learning_rate": 9.587351035949168e-06, "loss": 0.157, "step": 1680 }, { "entropy": 9.396444320678711, "epoch": 0.1661063871860787, "mean_token_accuracy": 0.6965240836143494, "num_tokens": 9236350.0, "step": 1680, "train/ce_loss": 0.6371164321899414 }, { "epoch": 0.1661063871860787, "step": 1680, "train/sim_loss": 0.03515625 }, { "epoch": 0.1661063871860787, "step": 1680, "train/total_loss": 0.09886789321899414 }, { "entropy": 9.178564071655273, "epoch": 0.16620526003559422, "mean_token_accuracy": 0.7672131061553955, "num_tokens": 9241845.0, "step": 1681, "train/ce_loss": 0.9518706202507019 }, { "epoch": 0.16620526003559422, "step": 1681, "train/sim_loss": 0.05859375 }, { "epoch": 0.16620526003559422, "step": 1681, "train/total_loss": 0.15378081798553467 }, { "entropy": 8.986527442932129, "epoch": 0.16630413288510976, "mean_token_accuracy": 0.7520161271095276, "num_tokens": 9247449.0, "step": 1682, "train/ce_loss": 0.9932412505149841 }, { "epoch": 0.16630413288510976, "step": 1682, "train/sim_loss": 0.0859375 }, { "epoch": 0.16630413288510976, "step": 1682, "train/total_loss": 0.18526163697242737 }, { "entropy": 9.32119083404541, "epoch": 0.16640300573462527, "mean_token_accuracy": 0.699881374835968, "num_tokens": 9252878.0, "step": 1683, "train/ce_loss": 0.8987904787063599 }, { "epoch": 0.16640300573462527, "step": 1683, "train/sim_loss": 0.08203125 }, { "epoch": 0.16640300573462527, "step": 1683, "train/total_loss": 0.17191030085086823 }, { "entropy": 9.114182472229004, "epoch": 0.16650187858414078, "mean_token_accuracy": 0.7165005803108215, "num_tokens": 9258415.0, "step": 1684, "train/ce_loss": 0.8592984676361084 }, { "epoch": 0.16650187858414078, "step": 1684, "train/sim_loss": 0.05078125 }, { "epoch": 0.16650187858414078, "step": 1684, "train/total_loss": 0.13671109080314636 }, { "entropy": 9.262504577636719, "epoch": 0.16660075143365632, "mean_token_accuracy": 0.7931442260742188, "num_tokens": 9263862.0, "step": 1685, "train/ce_loss": 0.6569254398345947 }, { "epoch": 0.16660075143365632, "step": 1685, "train/sim_loss": 0.0625 }, { "epoch": 0.16660075143365632, "step": 1685, "train/total_loss": 0.12819254398345947 }, { "entropy": 9.259771347045898, "epoch": 0.16669962428317184, "mean_token_accuracy": 0.7037861943244934, "num_tokens": 9269336.0, "step": 1686, "train/ce_loss": 1.4155524969100952 }, { "epoch": 0.16669962428317184, "step": 1686, "train/sim_loss": 0.078125 }, { "epoch": 0.16669962428317184, "step": 1686, "train/total_loss": 0.21968024969100952 }, { "entropy": 9.006634712219238, "epoch": 0.16679849713268735, "mean_token_accuracy": 0.751968502998352, "num_tokens": 9274966.0, "step": 1687, "train/ce_loss": 0.785474956035614 }, { "epoch": 0.16679849713268735, "step": 1687, "train/sim_loss": 0.03125 }, { "epoch": 0.16679849713268735, "step": 1687, "train/total_loss": 0.10979750007390976 }, { "entropy": 9.076419830322266, "epoch": 0.1668973699822029, "mean_token_accuracy": 0.7250945568084717, "num_tokens": 9280431.0, "step": 1688, "train/ce_loss": 0.6444607973098755 }, { "epoch": 0.1668973699822029, "step": 1688, "train/sim_loss": 0.07421875 }, { "epoch": 0.1668973699822029, "step": 1688, "train/total_loss": 0.1386648416519165 }, { "entropy": 9.529627799987793, "epoch": 0.1669962428317184, "mean_token_accuracy": 0.7277701497077942, "num_tokens": 9285875.0, "step": 1689, "train/ce_loss": 1.193678855895996 }, { "epoch": 0.1669962428317184, "step": 1689, "train/sim_loss": 0.1015625 }, { "epoch": 0.1669962428317184, "step": 1689, "train/total_loss": 0.22093039751052856 }, { "entropy": 9.32535171508789, "epoch": 0.16709511568123395, "mean_token_accuracy": 0.7508571147918701, "num_tokens": 9291341.0, "step": 1690, "train/ce_loss": 0.8057491183280945 }, { "epoch": 0.16709511568123395, "step": 1690, "train/sim_loss": 0.0703125 }, { "epoch": 0.16709511568123395, "step": 1690, "train/total_loss": 0.1508874148130417 }, { "entropy": 9.218406677246094, "epoch": 0.16719398853074946, "mean_token_accuracy": 0.7364253401756287, "num_tokens": 9296870.0, "step": 1691, "train/ce_loss": 0.7205709218978882 }, { "epoch": 0.16719398853074946, "step": 1691, "train/sim_loss": 0.0546875 }, { "epoch": 0.16719398853074946, "step": 1691, "train/total_loss": 0.1267445981502533 }, { "entropy": 9.058195114135742, "epoch": 0.16729286138026497, "mean_token_accuracy": 0.7167567610740662, "num_tokens": 9302355.0, "step": 1692, "train/ce_loss": 0.4878884255886078 }, { "epoch": 0.16729286138026497, "step": 1692, "train/sim_loss": 0.1328125 }, { "epoch": 0.16729286138026497, "step": 1692, "train/total_loss": 0.18160134553909302 }, { "entropy": 9.240180015563965, "epoch": 0.16739173422978051, "mean_token_accuracy": 0.7606635093688965, "num_tokens": 9307822.0, "step": 1693, "train/ce_loss": 1.1733222007751465 }, { "epoch": 0.16739173422978051, "step": 1693, "train/sim_loss": 0.109375 }, { "epoch": 0.16739173422978051, "step": 1693, "train/total_loss": 0.22670722007751465 }, { "entropy": 9.068025588989258, "epoch": 0.16749060707929603, "mean_token_accuracy": 0.7637305855751038, "num_tokens": 9313401.0, "step": 1694, "train/ce_loss": 0.7505602836608887 }, { "epoch": 0.16749060707929603, "step": 1694, "train/sim_loss": 0.0390625 }, { "epoch": 0.16749060707929603, "step": 1694, "train/total_loss": 0.1141185313463211 }, { "entropy": 9.282124519348145, "epoch": 0.16758947992881154, "mean_token_accuracy": 0.703832745552063, "num_tokens": 9318845.0, "step": 1695, "train/ce_loss": 1.011857032775879 }, { "epoch": 0.16758947992881154, "step": 1695, "train/sim_loss": 0.078125 }, { "epoch": 0.16758947992881154, "step": 1695, "train/total_loss": 0.17931070923805237 }, { "entropy": 9.299944877624512, "epoch": 0.16768835277832708, "mean_token_accuracy": 0.7392316460609436, "num_tokens": 9324316.0, "step": 1696, "train/ce_loss": 0.5198056101799011 }, { "epoch": 0.16768835277832708, "step": 1696, "train/sim_loss": 0.05859375 }, { "epoch": 0.16768835277832708, "step": 1696, "train/total_loss": 0.11057431250810623 }, { "entropy": 9.453145980834961, "epoch": 0.1677872256278426, "mean_token_accuracy": 0.7680764198303223, "num_tokens": 9329827.0, "step": 1697, "train/ce_loss": 0.9513021111488342 }, { "epoch": 0.1677872256278426, "step": 1697, "train/sim_loss": 0.0625 }, { "epoch": 0.1677872256278426, "step": 1697, "train/total_loss": 0.15763020515441895 }, { "entropy": 9.353811264038086, "epoch": 0.1678860984773581, "mean_token_accuracy": 0.7262773513793945, "num_tokens": 9335285.0, "step": 1698, "train/ce_loss": 0.7922934889793396 }, { "epoch": 0.1678860984773581, "step": 1698, "train/sim_loss": 0.0859375 }, { "epoch": 0.1678860984773581, "step": 1698, "train/total_loss": 0.16516685485839844 }, { "entropy": 9.240421295166016, "epoch": 0.16798497132687365, "mean_token_accuracy": 0.7639198303222656, "num_tokens": 9340789.0, "step": 1699, "train/ce_loss": 0.6811539530754089 }, { "epoch": 0.16798497132687365, "step": 1699, "train/sim_loss": 0.046875 }, { "epoch": 0.16798497132687365, "step": 1699, "train/total_loss": 0.11499039828777313 }, { "epoch": 0.16808384417638916, "grad_norm": 0.8799428343772888, "learning_rate": 9.582406171191218e-06, "loss": 0.1565, "step": 1700 }, { "entropy": 9.19083023071289, "epoch": 0.16808384417638916, "mean_token_accuracy": 0.7437641620635986, "num_tokens": 9346320.0, "step": 1700, "train/ce_loss": 0.6666135191917419 }, { "epoch": 0.16808384417638916, "step": 1700, "train/sim_loss": 0.03515625 }, { "epoch": 0.16808384417638916, "step": 1700, "train/total_loss": 0.10181760042905807 }, { "entropy": 9.066306114196777, "epoch": 0.16818271702590468, "mean_token_accuracy": 0.690499484539032, "num_tokens": 9351972.0, "step": 1701, "train/ce_loss": 0.7184156775474548 }, { "epoch": 0.16818271702590468, "step": 1701, "train/sim_loss": 0.0546875 }, { "epoch": 0.16818271702590468, "step": 1701, "train/total_loss": 0.12652906775474548 }, { "entropy": 9.02415657043457, "epoch": 0.16828158987542022, "mean_token_accuracy": 0.6930232644081116, "num_tokens": 9357448.0, "step": 1702, "train/ce_loss": 0.9707816243171692 }, { "epoch": 0.16828158987542022, "step": 1702, "train/sim_loss": 0.08984375 }, { "epoch": 0.16828158987542022, "step": 1702, "train/total_loss": 0.18692192435264587 }, { "entropy": 8.817510604858398, "epoch": 0.16838046272493573, "mean_token_accuracy": 0.7070116996765137, "num_tokens": 9363159.0, "step": 1703, "train/ce_loss": 1.5747179985046387 }, { "epoch": 0.16838046272493573, "step": 1703, "train/sim_loss": 0.06640625 }, { "epoch": 0.16838046272493573, "step": 1703, "train/total_loss": 0.22387805581092834 }, { "entropy": 9.668912887573242, "epoch": 0.16847933557445124, "mean_token_accuracy": 0.7418856024742126, "num_tokens": 9368370.0, "step": 1704, "train/ce_loss": 0.7772313952445984 }, { "epoch": 0.16847933557445124, "step": 1704, "train/sim_loss": 0.06640625 }, { "epoch": 0.16847933557445124, "step": 1704, "train/total_loss": 0.14412939548492432 }, { "entropy": 9.008167266845703, "epoch": 0.16857820842396679, "mean_token_accuracy": 0.7538610100746155, "num_tokens": 9374085.0, "step": 1705, "train/ce_loss": 0.5819916725158691 }, { "epoch": 0.16857820842396679, "step": 1705, "train/sim_loss": 0.09765625 }, { "epoch": 0.16857820842396679, "step": 1705, "train/total_loss": 0.15585541725158691 }, { "entropy": 9.305802345275879, "epoch": 0.1686770812734823, "mean_token_accuracy": 0.7041620016098022, "num_tokens": 9379536.0, "step": 1706, "train/ce_loss": 1.9663572311401367 }, { "epoch": 0.1686770812734823, "step": 1706, "train/sim_loss": 0.0703125 }, { "epoch": 0.1686770812734823, "step": 1706, "train/total_loss": 0.26694822311401367 }, { "entropy": 9.130168914794922, "epoch": 0.1687759541229978, "mean_token_accuracy": 0.7045235633850098, "num_tokens": 9385161.0, "step": 1707, "train/ce_loss": 0.6859099268913269 }, { "epoch": 0.1687759541229978, "step": 1707, "train/sim_loss": 0.1171875 }, { "epoch": 0.1687759541229978, "step": 1707, "train/total_loss": 0.18577849864959717 }, { "entropy": 9.6988525390625, "epoch": 0.16887482697251335, "mean_token_accuracy": 0.6751117706298828, "num_tokens": 9390334.0, "step": 1708, "train/ce_loss": 1.2551432847976685 }, { "epoch": 0.16887482697251335, "step": 1708, "train/sim_loss": 0.078125 }, { "epoch": 0.16887482697251335, "step": 1708, "train/total_loss": 0.20363932847976685 }, { "entropy": 9.63426685333252, "epoch": 0.16897369982202887, "mean_token_accuracy": 0.7417840361595154, "num_tokens": 9395605.0, "step": 1709, "train/ce_loss": 0.6414210200309753 }, { "epoch": 0.16897369982202887, "step": 1709, "train/sim_loss": 0.0625 }, { "epoch": 0.16897369982202887, "step": 1709, "train/total_loss": 0.126642107963562 }, { "entropy": 9.167936325073242, "epoch": 0.1690725726715444, "mean_token_accuracy": 0.7319587469100952, "num_tokens": 9401057.0, "step": 1710, "train/ce_loss": 0.8835290670394897 }, { "epoch": 0.1690725726715444, "step": 1710, "train/sim_loss": 0.109375 }, { "epoch": 0.1690725726715444, "step": 1710, "train/total_loss": 0.19772791862487793 }, { "entropy": 9.446664810180664, "epoch": 0.16917144552105992, "mean_token_accuracy": 0.8119551539421082, "num_tokens": 9406348.0, "step": 1711, "train/ce_loss": 0.415195494890213 }, { "epoch": 0.16917144552105992, "step": 1711, "train/sim_loss": 0.05859375 }, { "epoch": 0.16917144552105992, "step": 1711, "train/total_loss": 0.10011330246925354 }, { "entropy": 9.276082992553711, "epoch": 0.16927031837057543, "mean_token_accuracy": 0.757785439491272, "num_tokens": 9411831.0, "step": 1712, "train/ce_loss": 0.9539769291877747 }, { "epoch": 0.16927031837057543, "step": 1712, "train/sim_loss": 0.08203125 }, { "epoch": 0.16927031837057543, "step": 1712, "train/total_loss": 0.1774289458990097 }, { "entropy": 9.233299255371094, "epoch": 0.16936919122009098, "mean_token_accuracy": 0.7603121399879456, "num_tokens": 9417364.0, "step": 1713, "train/ce_loss": 0.5413237810134888 }, { "epoch": 0.16936919122009098, "step": 1713, "train/sim_loss": 0.06640625 }, { "epoch": 0.16936919122009098, "step": 1713, "train/total_loss": 0.120538629591465 }, { "entropy": 8.911981582641602, "epoch": 0.1694680640696065, "mean_token_accuracy": 0.7488372325897217, "num_tokens": 9423064.0, "step": 1714, "train/ce_loss": 0.4801175892353058 }, { "epoch": 0.1694680640696065, "step": 1714, "train/sim_loss": 0.078125 }, { "epoch": 0.1694680640696065, "step": 1714, "train/total_loss": 0.12613676488399506 }, { "entropy": 9.272786140441895, "epoch": 0.169566936919122, "mean_token_accuracy": 0.7471697926521301, "num_tokens": 9428495.0, "step": 1715, "train/ce_loss": 0.826190173625946 }, { "epoch": 0.169566936919122, "step": 1715, "train/sim_loss": 0.04296875 }, { "epoch": 0.169566936919122, "step": 1715, "train/total_loss": 0.12558776140213013 }, { "entropy": 9.354549407958984, "epoch": 0.16966580976863754, "mean_token_accuracy": 0.7551546096801758, "num_tokens": 9433877.0, "step": 1716, "train/ce_loss": 0.682782769203186 }, { "epoch": 0.16966580976863754, "step": 1716, "train/sim_loss": 0.07421875 }, { "epoch": 0.16966580976863754, "step": 1716, "train/total_loss": 0.14249703288078308 }, { "entropy": 9.669357299804688, "epoch": 0.16976468261815306, "mean_token_accuracy": 0.7768595218658447, "num_tokens": 9439087.0, "step": 1717, "train/ce_loss": 0.96755450963974 }, { "epoch": 0.16976468261815306, "step": 1717, "train/sim_loss": 0.03125 }, { "epoch": 0.16976468261815306, "step": 1717, "train/total_loss": 0.12800544500350952 }, { "entropy": 9.371861457824707, "epoch": 0.16986355546766857, "mean_token_accuracy": 0.7626903653144836, "num_tokens": 9444514.0, "step": 1718, "train/ce_loss": 0.9408624172210693 }, { "epoch": 0.16986355546766857, "step": 1718, "train/sim_loss": 0.08984375 }, { "epoch": 0.16986355546766857, "step": 1718, "train/total_loss": 0.18392999470233917 }, { "entropy": 9.579764366149902, "epoch": 0.1699624283171841, "mean_token_accuracy": 0.7503429651260376, "num_tokens": 9449844.0, "step": 1719, "train/ce_loss": 0.4774896204471588 }, { "epoch": 0.1699624283171841, "step": 1719, "train/sim_loss": 0.109375 }, { "epoch": 0.1699624283171841, "step": 1719, "train/total_loss": 0.15712396800518036 }, { "epoch": 0.17006130116669962, "grad_norm": 0.9025325775146484, "learning_rate": 9.57746130643327e-06, "loss": 0.1632, "step": 1720 }, { "entropy": 9.271869659423828, "epoch": 0.17006130116669962, "mean_token_accuracy": 0.718343198299408, "num_tokens": 9455336.0, "step": 1720, "train/ce_loss": 0.6991035342216492 }, { "epoch": 0.17006130116669962, "step": 1720, "train/sim_loss": 0.0703125 }, { "epoch": 0.17006130116669962, "step": 1720, "train/total_loss": 0.14022284746170044 }, { "entropy": 8.948516845703125, "epoch": 0.17016017401621514, "mean_token_accuracy": 0.7432306408882141, "num_tokens": 9461079.0, "step": 1721, "train/ce_loss": 0.570199191570282 }, { "epoch": 0.17016017401621514, "step": 1721, "train/sim_loss": 0.03515625 }, { "epoch": 0.17016017401621514, "step": 1721, "train/total_loss": 0.0921761691570282 }, { "entropy": 9.144454956054688, "epoch": 0.17025904686573068, "mean_token_accuracy": 0.7075055241584778, "num_tokens": 9466540.0, "step": 1722, "train/ce_loss": 0.39765071868896484 }, { "epoch": 0.17025904686573068, "step": 1722, "train/sim_loss": 0.06640625 }, { "epoch": 0.17025904686573068, "step": 1722, "train/total_loss": 0.10617132484912872 }, { "entropy": 9.009627342224121, "epoch": 0.1703579197152462, "mean_token_accuracy": 0.7237113118171692, "num_tokens": 9472085.0, "step": 1723, "train/ce_loss": 1.4744598865509033 }, { "epoch": 0.1703579197152462, "step": 1723, "train/sim_loss": 0.05859375 }, { "epoch": 0.1703579197152462, "step": 1723, "train/total_loss": 0.20603974163532257 }, { "entropy": 8.965954780578613, "epoch": 0.1704567925647617, "mean_token_accuracy": 0.7626373767852783, "num_tokens": 9477622.0, "step": 1724, "train/ce_loss": 0.7781122326850891 }, { "epoch": 0.1704567925647617, "step": 1724, "train/sim_loss": 0.07421875 }, { "epoch": 0.1704567925647617, "step": 1724, "train/total_loss": 0.15202997624874115 }, { "entropy": 9.043607711791992, "epoch": 0.17055566541427725, "mean_token_accuracy": 0.7173184156417847, "num_tokens": 9483076.0, "step": 1725, "train/ce_loss": 0.516547441482544 }, { "epoch": 0.17055566541427725, "step": 1725, "train/sim_loss": 0.06640625 }, { "epoch": 0.17055566541427725, "step": 1725, "train/total_loss": 0.11806099116802216 }, { "entropy": 8.75768756866455, "epoch": 0.17065453826379276, "mean_token_accuracy": 0.7042869925498962, "num_tokens": 9488848.0, "step": 1726, "train/ce_loss": 1.7951922416687012 }, { "epoch": 0.17065453826379276, "step": 1726, "train/sim_loss": 0.046875 }, { "epoch": 0.17065453826379276, "step": 1726, "train/total_loss": 0.22639422118663788 }, { "entropy": 9.55865478515625, "epoch": 0.17075341111330827, "mean_token_accuracy": 0.7279305458068848, "num_tokens": 9494118.0, "step": 1727, "train/ce_loss": 0.5668206214904785 }, { "epoch": 0.17075341111330827, "step": 1727, "train/sim_loss": 0.07421875 }, { "epoch": 0.17075341111330827, "step": 1727, "train/total_loss": 0.1309008151292801 }, { "entropy": 9.164319038391113, "epoch": 0.17085228396282381, "mean_token_accuracy": 0.7250000238418579, "num_tokens": 9499691.0, "step": 1728, "train/ce_loss": 1.766099452972412 }, { "epoch": 0.17085228396282381, "step": 1728, "train/sim_loss": 0.078125 }, { "epoch": 0.17085228396282381, "step": 1728, "train/total_loss": 0.25473493337631226 }, { "entropy": 8.707769393920898, "epoch": 0.17095115681233933, "mean_token_accuracy": 0.6882129311561584, "num_tokens": 9505368.0, "step": 1729, "train/ce_loss": 0.7112627625465393 }, { "epoch": 0.17095115681233933, "step": 1729, "train/sim_loss": 0.125 }, { "epoch": 0.17095115681233933, "step": 1729, "train/total_loss": 0.1961262822151184 }, { "entropy": 8.7677583694458, "epoch": 0.17105002966185484, "mean_token_accuracy": 0.8116232752799988, "num_tokens": 9511065.0, "step": 1730, "train/ce_loss": 0.48834967613220215 }, { "epoch": 0.17105002966185484, "step": 1730, "train/sim_loss": 0.03125 }, { "epoch": 0.17105002966185484, "step": 1730, "train/total_loss": 0.08008496463298798 }, { "entropy": 9.208024024963379, "epoch": 0.17114890251137038, "mean_token_accuracy": 0.7232037782669067, "num_tokens": 9516521.0, "step": 1731, "train/ce_loss": 0.7271567583084106 }, { "epoch": 0.17114890251137038, "step": 1731, "train/sim_loss": 0.07421875 }, { "epoch": 0.17114890251137038, "step": 1731, "train/total_loss": 0.1469344198703766 }, { "entropy": 9.14810562133789, "epoch": 0.1712477753608859, "mean_token_accuracy": 0.7180851101875305, "num_tokens": 9522085.0, "step": 1732, "train/ce_loss": 1.0752151012420654 }, { "epoch": 0.1712477753608859, "step": 1732, "train/sim_loss": 0.0859375 }, { "epoch": 0.1712477753608859, "step": 1732, "train/total_loss": 0.19345900416374207 }, { "entropy": 9.02522087097168, "epoch": 0.17134664821040144, "mean_token_accuracy": 0.7431761622428894, "num_tokens": 9527577.0, "step": 1733, "train/ce_loss": 1.0068392753601074 }, { "epoch": 0.17134664821040144, "step": 1733, "train/sim_loss": 0.09765625 }, { "epoch": 0.17134664821040144, "step": 1733, "train/total_loss": 0.19834017753601074 }, { "entropy": 9.313077926635742, "epoch": 0.17144552105991695, "mean_token_accuracy": 0.8048780560493469, "num_tokens": 9533015.0, "step": 1734, "train/ce_loss": 0.4979565143585205 }, { "epoch": 0.17144552105991695, "step": 1734, "train/sim_loss": 0.03125 }, { "epoch": 0.17144552105991695, "step": 1734, "train/total_loss": 0.08104565739631653 }, { "entropy": 9.450675964355469, "epoch": 0.17154439390943246, "mean_token_accuracy": 0.7414012551307678, "num_tokens": 9538348.0, "step": 1735, "train/ce_loss": 0.9563649296760559 }, { "epoch": 0.17154439390943246, "step": 1735, "train/sim_loss": 0.0546875 }, { "epoch": 0.17154439390943246, "step": 1735, "train/total_loss": 0.1503239870071411 }, { "entropy": 9.112768173217773, "epoch": 0.171643266758948, "mean_token_accuracy": 0.787104606628418, "num_tokens": 9543859.0, "step": 1736, "train/ce_loss": 0.48079249262809753 }, { "epoch": 0.171643266758948, "step": 1736, "train/sim_loss": 0.0390625 }, { "epoch": 0.171643266758948, "step": 1736, "train/total_loss": 0.08714175224304199 }, { "entropy": 9.075857162475586, "epoch": 0.17174213960846352, "mean_token_accuracy": 0.7319915294647217, "num_tokens": 9549433.0, "step": 1737, "train/ce_loss": 0.49439582228660583 }, { "epoch": 0.17174213960846352, "step": 1737, "train/sim_loss": 0.078125 }, { "epoch": 0.17174213960846352, "step": 1737, "train/total_loss": 0.12756457924842834 }, { "entropy": 9.483123779296875, "epoch": 0.17184101245797903, "mean_token_accuracy": 0.7772925496101379, "num_tokens": 9554770.0, "step": 1738, "train/ce_loss": 0.5405584573745728 }, { "epoch": 0.17184101245797903, "step": 1738, "train/sim_loss": 0.078125 }, { "epoch": 0.17184101245797903, "step": 1738, "train/total_loss": 0.1321808397769928 }, { "entropy": 9.308540344238281, "epoch": 0.17193988530749457, "mean_token_accuracy": 0.6646562218666077, "num_tokens": 9560266.0, "step": 1739, "train/ce_loss": 0.9868186116218567 }, { "epoch": 0.17193988530749457, "step": 1739, "train/sim_loss": 0.08984375 }, { "epoch": 0.17193988530749457, "step": 1739, "train/total_loss": 0.18852561712265015 }, { "epoch": 0.17203875815701009, "grad_norm": 1.189910650253296, "learning_rate": 9.57251644167532e-06, "loss": 0.1602, "step": 1740 }, { "entropy": 9.334311485290527, "epoch": 0.17203875815701009, "mean_token_accuracy": 0.73235684633255, "num_tokens": 9565666.0, "step": 1740, "train/ce_loss": 0.7928887605667114 }, { "epoch": 0.17203875815701009, "step": 1740, "train/sim_loss": 0.0703125 }, { "epoch": 0.17203875815701009, "step": 1740, "train/total_loss": 0.14960137009620667 }, { "entropy": 9.136784553527832, "epoch": 0.1721376310065256, "mean_token_accuracy": 0.6906077265739441, "num_tokens": 9571175.0, "step": 1741, "train/ce_loss": 0.27960750460624695 }, { "epoch": 0.1721376310065256, "step": 1741, "train/sim_loss": 0.0625 }, { "epoch": 0.1721376310065256, "step": 1741, "train/total_loss": 0.09046074748039246 }, { "entropy": 9.057108879089355, "epoch": 0.17223650385604114, "mean_token_accuracy": 0.7391732335090637, "num_tokens": 9576774.0, "step": 1742, "train/ce_loss": 0.3893411457538605 }, { "epoch": 0.17223650385604114, "step": 1742, "train/sim_loss": 0.07421875 }, { "epoch": 0.17223650385604114, "step": 1742, "train/total_loss": 0.11315286159515381 }, { "entropy": 9.304640769958496, "epoch": 0.17233537670555665, "mean_token_accuracy": 0.7557160258293152, "num_tokens": 9582184.0, "step": 1743, "train/ce_loss": 0.40707871317863464 }, { "epoch": 0.17233537670555665, "step": 1743, "train/sim_loss": 0.0703125 }, { "epoch": 0.17233537670555665, "step": 1743, "train/total_loss": 0.11102037131786346 }, { "entropy": 9.557701110839844, "epoch": 0.17243424955507217, "mean_token_accuracy": 0.7441217303276062, "num_tokens": 9587454.0, "step": 1744, "train/ce_loss": 1.2287436723709106 }, { "epoch": 0.17243424955507217, "step": 1744, "train/sim_loss": 0.07421875 }, { "epoch": 0.17243424955507217, "step": 1744, "train/total_loss": 0.19709312915802002 }, { "entropy": 8.996572494506836, "epoch": 0.1725331224045877, "mean_token_accuracy": 0.7257525324821472, "num_tokens": 9592988.0, "step": 1745, "train/ce_loss": 0.6745277047157288 }, { "epoch": 0.1725331224045877, "step": 1745, "train/sim_loss": 0.06640625 }, { "epoch": 0.1725331224045877, "step": 1745, "train/total_loss": 0.13385902345180511 }, { "entropy": 9.539762496948242, "epoch": 0.17263199525410322, "mean_token_accuracy": 0.7635402679443359, "num_tokens": 9598531.0, "step": 1746, "train/ce_loss": 0.6932487487792969 }, { "epoch": 0.17263199525410322, "step": 1746, "train/sim_loss": 0.06640625 }, { "epoch": 0.17263199525410322, "step": 1746, "train/total_loss": 0.13573113083839417 }, { "entropy": 8.623954772949219, "epoch": 0.17273086810361873, "mean_token_accuracy": 0.7362637519836426, "num_tokens": 9604462.0, "step": 1747, "train/ce_loss": 0.454692542552948 }, { "epoch": 0.17273086810361873, "step": 1747, "train/sim_loss": 0.1484375 }, { "epoch": 0.17273086810361873, "step": 1747, "train/total_loss": 0.1939067542552948 }, { "entropy": 9.266886711120605, "epoch": 0.17282974095313428, "mean_token_accuracy": 0.7408313155174255, "num_tokens": 9609927.0, "step": 1748, "train/ce_loss": 0.5112321972846985 }, { "epoch": 0.17282974095313428, "step": 1748, "train/sim_loss": 0.046875 }, { "epoch": 0.17282974095313428, "step": 1748, "train/total_loss": 0.09799821674823761 }, { "entropy": 8.926322937011719, "epoch": 0.1729286138026498, "mean_token_accuracy": 0.7008733749389648, "num_tokens": 9615600.0, "step": 1749, "train/ce_loss": 0.7347846031188965 }, { "epoch": 0.1729286138026498, "step": 1749, "train/sim_loss": 0.10546875 }, { "epoch": 0.1729286138026498, "step": 1749, "train/total_loss": 0.17894721031188965 }, { "entropy": 8.815605163574219, "epoch": 0.1730274866521653, "mean_token_accuracy": 0.743852436542511, "num_tokens": 9621225.0, "step": 1750, "train/ce_loss": 0.7988265156745911 }, { "epoch": 0.1730274866521653, "step": 1750, "train/sim_loss": 0.1015625 }, { "epoch": 0.1730274866521653, "step": 1750, "train/total_loss": 0.1814451515674591 }, { "entropy": 9.206015586853027, "epoch": 0.17312635950168084, "mean_token_accuracy": 0.7550111413002014, "num_tokens": 9626718.0, "step": 1751, "train/ce_loss": 0.8918145298957825 }, { "epoch": 0.17312635950168084, "step": 1751, "train/sim_loss": 0.05078125 }, { "epoch": 0.17312635950168084, "step": 1751, "train/total_loss": 0.13996270298957825 }, { "entropy": 9.444564819335938, "epoch": 0.17322523235119636, "mean_token_accuracy": 0.7367706894874573, "num_tokens": 9632058.0, "step": 1752, "train/ce_loss": 0.6827493906021118 }, { "epoch": 0.17322523235119636, "step": 1752, "train/sim_loss": 0.078125 }, { "epoch": 0.17322523235119636, "step": 1752, "train/total_loss": 0.14639994502067566 }, { "entropy": 9.231491088867188, "epoch": 0.1733241052007119, "mean_token_accuracy": 0.7177321910858154, "num_tokens": 9637503.0, "step": 1753, "train/ce_loss": 0.7255060076713562 }, { "epoch": 0.1733241052007119, "step": 1753, "train/sim_loss": 0.08203125 }, { "epoch": 0.1733241052007119, "step": 1753, "train/total_loss": 0.15458184480667114 }, { "entropy": 9.48372745513916, "epoch": 0.1734229780502274, "mean_token_accuracy": 0.6876640319824219, "num_tokens": 9642869.0, "step": 1754, "train/ce_loss": 0.6230818033218384 }, { "epoch": 0.1734229780502274, "step": 1754, "train/sim_loss": 0.11328125 }, { "epoch": 0.1734229780502274, "step": 1754, "train/total_loss": 0.1755894273519516 }, { "entropy": 9.148332595825195, "epoch": 0.17352185089974292, "mean_token_accuracy": 0.7344444394111633, "num_tokens": 9648459.0, "step": 1755, "train/ce_loss": 0.33028122782707214 }, { "epoch": 0.17352185089974292, "step": 1755, "train/sim_loss": 0.0859375 }, { "epoch": 0.17352185089974292, "step": 1755, "train/total_loss": 0.11896562576293945 }, { "entropy": 9.432771682739258, "epoch": 0.17362072374925847, "mean_token_accuracy": 0.7553324699401855, "num_tokens": 9653817.0, "step": 1756, "train/ce_loss": 0.7303709387779236 }, { "epoch": 0.17362072374925847, "step": 1756, "train/sim_loss": 0.0703125 }, { "epoch": 0.17362072374925847, "step": 1756, "train/total_loss": 0.14334958791732788 }, { "entropy": 9.215439796447754, "epoch": 0.17371959659877398, "mean_token_accuracy": 0.7352555990219116, "num_tokens": 9659310.0, "step": 1757, "train/ce_loss": 0.70037442445755 }, { "epoch": 0.17371959659877398, "step": 1757, "train/sim_loss": 0.078125 }, { "epoch": 0.17371959659877398, "step": 1757, "train/total_loss": 0.14816245436668396 }, { "entropy": 9.3016357421875, "epoch": 0.1738184694482895, "mean_token_accuracy": 0.7326507568359375, "num_tokens": 9664763.0, "step": 1758, "train/ce_loss": 0.590712308883667 }, { "epoch": 0.1738184694482895, "step": 1758, "train/sim_loss": 0.0859375 }, { "epoch": 0.1738184694482895, "step": 1758, "train/total_loss": 0.14500872790813446 }, { "entropy": 9.039403915405273, "epoch": 0.17391734229780503, "mean_token_accuracy": 0.7815126180648804, "num_tokens": 9670383.0, "step": 1759, "train/ce_loss": 0.35507792234420776 }, { "epoch": 0.17391734229780503, "step": 1759, "train/sim_loss": 0.07421875 }, { "epoch": 0.17391734229780503, "step": 1759, "train/total_loss": 0.10972654819488525 }, { "epoch": 0.17401621514732055, "grad_norm": 0.8320774435997009, "learning_rate": 9.567571576917371e-06, "loss": 0.1654, "step": 1760 }, { "entropy": 9.408206939697266, "epoch": 0.17401621514732055, "mean_token_accuracy": 0.7540372610092163, "num_tokens": 9675729.0, "step": 1760, "train/ce_loss": 1.243815302848816 }, { "epoch": 0.17401621514732055, "step": 1760, "train/sim_loss": 0.06640625 }, { "epoch": 0.17401621514732055, "step": 1760, "train/total_loss": 0.19078779220581055 }, { "entropy": 9.371319770812988, "epoch": 0.17411508799683606, "mean_token_accuracy": 0.7873183488845825, "num_tokens": 9681176.0, "step": 1761, "train/ce_loss": 0.7133687734603882 }, { "epoch": 0.17411508799683606, "step": 1761, "train/sim_loss": 0.06640625 }, { "epoch": 0.17411508799683606, "step": 1761, "train/total_loss": 0.13774313032627106 }, { "entropy": 9.284880638122559, "epoch": 0.1742139608463516, "mean_token_accuracy": 0.7663551568984985, "num_tokens": 9686654.0, "step": 1762, "train/ce_loss": 0.3664151132106781 }, { "epoch": 0.1742139608463516, "step": 1762, "train/sim_loss": 0.078125 }, { "epoch": 0.1742139608463516, "step": 1762, "train/total_loss": 0.11476650834083557 }, { "entropy": 8.887441635131836, "epoch": 0.17431283369586711, "mean_token_accuracy": 0.7316433787345886, "num_tokens": 9692559.0, "step": 1763, "train/ce_loss": 0.8869920372962952 }, { "epoch": 0.17431283369586711, "step": 1763, "train/sim_loss": 0.078125 }, { "epoch": 0.17431283369586711, "step": 1763, "train/total_loss": 0.16682420670986176 }, { "entropy": 8.9581298828125, "epoch": 0.17441170654538263, "mean_token_accuracy": 0.7356746792793274, "num_tokens": 9698154.0, "step": 1764, "train/ce_loss": 0.3349401652812958 }, { "epoch": 0.17441170654538263, "step": 1764, "train/sim_loss": 0.0234375 }, { "epoch": 0.17441170654538263, "step": 1764, "train/total_loss": 0.0569315180182457 }, { "entropy": 9.281572341918945, "epoch": 0.17451057939489817, "mean_token_accuracy": 0.75, "num_tokens": 9703592.0, "step": 1765, "train/ce_loss": 1.0483472347259521 }, { "epoch": 0.17451057939489817, "step": 1765, "train/sim_loss": 0.09375 }, { "epoch": 0.17451057939489817, "step": 1765, "train/total_loss": 0.19858473539352417 }, { "entropy": 9.220732688903809, "epoch": 0.17460945224441368, "mean_token_accuracy": 0.8025851845741272, "num_tokens": 9709004.0, "step": 1766, "train/ce_loss": 0.5480649471282959 }, { "epoch": 0.17460945224441368, "step": 1766, "train/sim_loss": 0.04296875 }, { "epoch": 0.17460945224441368, "step": 1766, "train/total_loss": 0.09777525067329407 }, { "entropy": 8.879413604736328, "epoch": 0.1747083250939292, "mean_token_accuracy": 0.7103658318519592, "num_tokens": 9714576.0, "step": 1767, "train/ce_loss": 0.9534274339675903 }, { "epoch": 0.1747083250939292, "step": 1767, "train/sim_loss": 0.25390625 }, { "epoch": 0.1747083250939292, "step": 1767, "train/total_loss": 0.349249005317688 }, { "entropy": 9.269755363464355, "epoch": 0.17480719794344474, "mean_token_accuracy": 0.7290249466896057, "num_tokens": 9719976.0, "step": 1768, "train/ce_loss": 0.9241123199462891 }, { "epoch": 0.17480719794344474, "step": 1768, "train/sim_loss": 0.0390625 }, { "epoch": 0.17480719794344474, "step": 1768, "train/total_loss": 0.13147373497486115 }, { "entropy": 9.2384033203125, "epoch": 0.17490607079296025, "mean_token_accuracy": 0.7707082629203796, "num_tokens": 9725350.0, "step": 1769, "train/ce_loss": 0.9413095116615295 }, { "epoch": 0.17490607079296025, "step": 1769, "train/sim_loss": 0.08203125 }, { "epoch": 0.17490607079296025, "step": 1769, "train/total_loss": 0.1761622130870819 }, { "entropy": 9.290754318237305, "epoch": 0.17500494364247576, "mean_token_accuracy": 0.7188295125961304, "num_tokens": 9730720.0, "step": 1770, "train/ce_loss": 0.5578386783599854 }, { "epoch": 0.17500494364247576, "step": 1770, "train/sim_loss": 0.0703125 }, { "epoch": 0.17500494364247576, "step": 1770, "train/total_loss": 0.12609636783599854 }, { "entropy": 9.445374488830566, "epoch": 0.1751038164919913, "mean_token_accuracy": 0.7595628499984741, "num_tokens": 9736090.0, "step": 1771, "train/ce_loss": 0.3721587061882019 }, { "epoch": 0.1751038164919913, "step": 1771, "train/sim_loss": 0.1171875 }, { "epoch": 0.1751038164919913, "step": 1771, "train/total_loss": 0.15440337359905243 }, { "entropy": 9.119098663330078, "epoch": 0.17520268934150682, "mean_token_accuracy": 0.7005405426025391, "num_tokens": 9741639.0, "step": 1772, "train/ce_loss": 0.6550112962722778 }, { "epoch": 0.17520268934150682, "step": 1772, "train/sim_loss": 0.05859375 }, { "epoch": 0.17520268934150682, "step": 1772, "train/total_loss": 0.1240948811173439 }, { "entropy": 9.25763988494873, "epoch": 0.17530156219102236, "mean_token_accuracy": 0.7690417766571045, "num_tokens": 9747084.0, "step": 1773, "train/ce_loss": 0.9380912780761719 }, { "epoch": 0.17530156219102236, "step": 1773, "train/sim_loss": 0.09375 }, { "epoch": 0.17530156219102236, "step": 1773, "train/total_loss": 0.1875591278076172 }, { "entropy": 9.264333724975586, "epoch": 0.17540043504053787, "mean_token_accuracy": 0.7355278134346008, "num_tokens": 9752554.0, "step": 1774, "train/ce_loss": 1.0266929864883423 }, { "epoch": 0.17540043504053787, "step": 1774, "train/sim_loss": 0.08984375 }, { "epoch": 0.17540043504053787, "step": 1774, "train/total_loss": 0.19251304864883423 }, { "entropy": 9.078041076660156, "epoch": 0.17549930789005339, "mean_token_accuracy": 0.6844305396080017, "num_tokens": 9758186.0, "step": 1775, "train/ce_loss": 0.6231216192245483 }, { "epoch": 0.17549930789005339, "step": 1775, "train/sim_loss": 0.09765625 }, { "epoch": 0.17549930789005339, "step": 1775, "train/total_loss": 0.15996840596199036 }, { "entropy": 9.281214714050293, "epoch": 0.17559818073956893, "mean_token_accuracy": 0.75, "num_tokens": 9763605.0, "step": 1776, "train/ce_loss": 0.9836775660514832 }, { "epoch": 0.17559818073956893, "step": 1776, "train/sim_loss": 0.078125 }, { "epoch": 0.17559818073956893, "step": 1776, "train/total_loss": 0.17649275064468384 }, { "entropy": 9.15318775177002, "epoch": 0.17569705358908444, "mean_token_accuracy": 0.7029914259910583, "num_tokens": 9769156.0, "step": 1777, "train/ce_loss": 0.930944561958313 }, { "epoch": 0.17569705358908444, "step": 1777, "train/sim_loss": 0.05078125 }, { "epoch": 0.17569705358908444, "step": 1777, "train/total_loss": 0.14387571811676025 }, { "entropy": 9.045417785644531, "epoch": 0.17579592643859995, "mean_token_accuracy": 0.7389006614685059, "num_tokens": 9774765.0, "step": 1778, "train/ce_loss": 0.7955159544944763 }, { "epoch": 0.17579592643859995, "step": 1778, "train/sim_loss": 0.11328125 }, { "epoch": 0.17579592643859995, "step": 1778, "train/total_loss": 0.1928328573703766 }, { "entropy": 9.227349281311035, "epoch": 0.1758947992881155, "mean_token_accuracy": 0.8095238208770752, "num_tokens": 9780213.0, "step": 1779, "train/ce_loss": 0.6439024806022644 }, { "epoch": 0.1758947992881155, "step": 1779, "train/sim_loss": 0.03125 }, { "epoch": 0.1758947992881155, "step": 1779, "train/total_loss": 0.09564024955034256 }, { "epoch": 0.175993672137631, "grad_norm": 0.7326927781105042, "learning_rate": 9.562626712159424e-06, "loss": 0.152, "step": 1780 }, { "entropy": 9.203082084655762, "epoch": 0.175993672137631, "mean_token_accuracy": 0.7753086686134338, "num_tokens": 9785646.0, "step": 1780, "train/ce_loss": 0.7712109684944153 }, { "epoch": 0.175993672137631, "step": 1780, "train/sim_loss": 0.12109375 }, { "epoch": 0.175993672137631, "step": 1780, "train/total_loss": 0.19821485877037048 }, { "entropy": 9.342802047729492, "epoch": 0.17609254498714652, "mean_token_accuracy": 0.7609289884567261, "num_tokens": 9791048.0, "step": 1781, "train/ce_loss": 0.45253849029541016 }, { "epoch": 0.17609254498714652, "step": 1781, "train/sim_loss": 0.07421875 }, { "epoch": 0.17609254498714652, "step": 1781, "train/total_loss": 0.11947260051965714 }, { "entropy": 8.94689655303955, "epoch": 0.17619141783666206, "mean_token_accuracy": 0.7386877536773682, "num_tokens": 9796538.0, "step": 1782, "train/ce_loss": 1.203294038772583 }, { "epoch": 0.17619141783666206, "step": 1782, "train/sim_loss": 0.05859375 }, { "epoch": 0.17619141783666206, "step": 1782, "train/total_loss": 0.17892315983772278 }, { "entropy": 9.41569709777832, "epoch": 0.17629029068617758, "mean_token_accuracy": 0.7008652687072754, "num_tokens": 9801952.0, "step": 1783, "train/ce_loss": 1.0502218008041382 }, { "epoch": 0.17629029068617758, "step": 1783, "train/sim_loss": 0.10546875 }, { "epoch": 0.17629029068617758, "step": 1783, "train/total_loss": 0.21049094200134277 }, { "entropy": 9.43376350402832, "epoch": 0.1763891635356931, "mean_token_accuracy": 0.75, "num_tokens": 9807238.0, "step": 1784, "train/ce_loss": 1.4428889751434326 }, { "epoch": 0.1763891635356931, "step": 1784, "train/sim_loss": 0.08203125 }, { "epoch": 0.1763891635356931, "step": 1784, "train/total_loss": 0.22632014751434326 }, { "entropy": 9.027387619018555, "epoch": 0.17648803638520863, "mean_token_accuracy": 0.7864077687263489, "num_tokens": 9812821.0, "step": 1785, "train/ce_loss": 0.7178101539611816 }, { "epoch": 0.17648803638520863, "step": 1785, "train/sim_loss": 0.109375 }, { "epoch": 0.17648803638520863, "step": 1785, "train/total_loss": 0.1811560094356537 }, { "entropy": 9.406821250915527, "epoch": 0.17658690923472414, "mean_token_accuracy": 0.7280108332633972, "num_tokens": 9818176.0, "step": 1786, "train/ce_loss": 0.5969212651252747 }, { "epoch": 0.17658690923472414, "step": 1786, "train/sim_loss": 0.06640625 }, { "epoch": 0.17658690923472414, "step": 1786, "train/total_loss": 0.1260983794927597 }, { "entropy": 9.526542663574219, "epoch": 0.17668578208423966, "mean_token_accuracy": 0.7403225898742676, "num_tokens": 9823419.0, "step": 1787, "train/ce_loss": 1.1806479692459106 }, { "epoch": 0.17668578208423966, "step": 1787, "train/sim_loss": 0.0859375 }, { "epoch": 0.17668578208423966, "step": 1787, "train/total_loss": 0.2040022909641266 }, { "entropy": 9.616584777832031, "epoch": 0.1767846549337552, "mean_token_accuracy": 0.7354260087013245, "num_tokens": 9828652.0, "step": 1788, "train/ce_loss": 0.9177260398864746 }, { "epoch": 0.1767846549337552, "step": 1788, "train/sim_loss": 0.046875 }, { "epoch": 0.1767846549337552, "step": 1788, "train/total_loss": 0.13864761590957642 }, { "entropy": 9.118926048278809, "epoch": 0.1768835277832707, "mean_token_accuracy": 0.7442105412483215, "num_tokens": 9834258.0, "step": 1789, "train/ce_loss": 0.6955993175506592 }, { "epoch": 0.1768835277832707, "step": 1789, "train/sim_loss": 0.05859375 }, { "epoch": 0.1768835277832707, "step": 1789, "train/total_loss": 0.12815368175506592 }, { "entropy": 9.264177322387695, "epoch": 0.17698240063278622, "mean_token_accuracy": 0.7232704162597656, "num_tokens": 9839646.0, "step": 1790, "train/ce_loss": 0.6712148189544678 }, { "epoch": 0.17698240063278622, "step": 1790, "train/sim_loss": 0.08203125 }, { "epoch": 0.17698240063278622, "step": 1790, "train/total_loss": 0.1491527259349823 }, { "entropy": 9.316158294677734, "epoch": 0.17708127348230177, "mean_token_accuracy": 0.8084577322006226, "num_tokens": 9845086.0, "step": 1791, "train/ce_loss": 0.5043331384658813 }, { "epoch": 0.17708127348230177, "step": 1791, "train/sim_loss": 0.09765625 }, { "epoch": 0.17708127348230177, "step": 1791, "train/total_loss": 0.14808955788612366 }, { "entropy": 9.08039665222168, "epoch": 0.17718014633181728, "mean_token_accuracy": 0.7272727489471436, "num_tokens": 9850569.0, "step": 1792, "train/ce_loss": 1.0692877769470215 }, { "epoch": 0.17718014633181728, "step": 1792, "train/sim_loss": 0.0625 }, { "epoch": 0.17718014633181728, "step": 1792, "train/total_loss": 0.1694287806749344 }, { "entropy": 9.186040878295898, "epoch": 0.17727901918133282, "mean_token_accuracy": 0.7471655607223511, "num_tokens": 9856071.0, "step": 1793, "train/ce_loss": 1.0780153274536133 }, { "epoch": 0.17727901918133282, "step": 1793, "train/sim_loss": 0.08984375 }, { "epoch": 0.17727901918133282, "step": 1793, "train/total_loss": 0.19764527678489685 }, { "entropy": 9.27293586730957, "epoch": 0.17737789203084833, "mean_token_accuracy": 0.750325083732605, "num_tokens": 9861458.0, "step": 1794, "train/ce_loss": 1.1005361080169678 }, { "epoch": 0.17737789203084833, "step": 1794, "train/sim_loss": 0.0703125 }, { "epoch": 0.17737789203084833, "step": 1794, "train/total_loss": 0.18036611378192902 }, { "entropy": 9.056093215942383, "epoch": 0.17747676488036385, "mean_token_accuracy": 0.7219604253768921, "num_tokens": 9867046.0, "step": 1795, "train/ce_loss": 0.8646151423454285 }, { "epoch": 0.17747676488036385, "step": 1795, "train/sim_loss": 0.0859375 }, { "epoch": 0.17747676488036385, "step": 1795, "train/total_loss": 0.17239901423454285 }, { "entropy": 8.92136001586914, "epoch": 0.1775756377298794, "mean_token_accuracy": 0.7814499139785767, "num_tokens": 9872638.0, "step": 1796, "train/ce_loss": 0.5031859874725342 }, { "epoch": 0.1775756377298794, "step": 1796, "train/sim_loss": 0.06640625 }, { "epoch": 0.1775756377298794, "step": 1796, "train/total_loss": 0.11672484874725342 }, { "entropy": 8.886089324951172, "epoch": 0.1776745105793949, "mean_token_accuracy": 0.7422266602516174, "num_tokens": 9878290.0, "step": 1797, "train/ce_loss": 0.584120512008667 }, { "epoch": 0.1776745105793949, "step": 1797, "train/sim_loss": 0.08984375 }, { "epoch": 0.1776745105793949, "step": 1797, "train/total_loss": 0.14825579524040222 }, { "entropy": 8.870548248291016, "epoch": 0.17777338342891041, "mean_token_accuracy": 0.6900085210800171, "num_tokens": 9884061.0, "step": 1798, "train/ce_loss": 0.5736979842185974 }, { "epoch": 0.17777338342891041, "step": 1798, "train/sim_loss": 0.0703125 }, { "epoch": 0.17777338342891041, "step": 1798, "train/total_loss": 0.12768229842185974 }, { "entropy": 9.196345329284668, "epoch": 0.17787225627842596, "mean_token_accuracy": 0.7096773982048035, "num_tokens": 9889521.0, "step": 1799, "train/ce_loss": 1.0773898363113403 }, { "epoch": 0.17787225627842596, "step": 1799, "train/sim_loss": 0.0625 }, { "epoch": 0.17787225627842596, "step": 1799, "train/total_loss": 0.17023898661136627 }, { "epoch": 0.17797112912794147, "grad_norm": 1.1318002939224243, "learning_rate": 9.557681847401474e-06, "loss": 0.1638, "step": 1800 }, { "entropy": 9.486862182617188, "epoch": 0.17797112912794147, "mean_token_accuracy": 0.7530674934387207, "num_tokens": 9894766.0, "step": 1800, "train/ce_loss": 0.6013689637184143 }, { "epoch": 0.17797112912794147, "step": 1800, "train/sim_loss": 0.0546875 }, { "epoch": 0.17797112912794147, "step": 1800, "train/total_loss": 0.11482439935207367 }, { "entropy": 9.29083251953125, "epoch": 0.17807000197745698, "mean_token_accuracy": 0.7557471394538879, "num_tokens": 9900044.0, "step": 1801, "train/ce_loss": 0.7010975480079651 }, { "epoch": 0.17807000197745698, "step": 1801, "train/sim_loss": 0.06640625 }, { "epoch": 0.17807000197745698, "step": 1801, "train/total_loss": 0.1365160048007965 }, { "entropy": 9.154129028320312, "epoch": 0.17816887482697252, "mean_token_accuracy": 0.7791208624839783, "num_tokens": 9905704.0, "step": 1802, "train/ce_loss": 0.8541761040687561 }, { "epoch": 0.17816887482697252, "step": 1802, "train/sim_loss": 0.06640625 }, { "epoch": 0.17816887482697252, "step": 1802, "train/total_loss": 0.15182386338710785 }, { "entropy": 9.16222858428955, "epoch": 0.17826774767648804, "mean_token_accuracy": 0.7230591177940369, "num_tokens": 9911160.0, "step": 1803, "train/ce_loss": 0.6022725105285645 }, { "epoch": 0.17826774767648804, "step": 1803, "train/sim_loss": 0.078125 }, { "epoch": 0.17826774767648804, "step": 1803, "train/total_loss": 0.13835224509239197 }, { "entropy": 9.229665756225586, "epoch": 0.17836662052600355, "mean_token_accuracy": 0.8054892420768738, "num_tokens": 9916650.0, "step": 1804, "train/ce_loss": 0.800669252872467 }, { "epoch": 0.17836662052600355, "step": 1804, "train/sim_loss": 0.03125 }, { "epoch": 0.17836662052600355, "step": 1804, "train/total_loss": 0.11131692677736282 }, { "entropy": 9.07126235961914, "epoch": 0.1784654933755191, "mean_token_accuracy": 0.7467072010040283, "num_tokens": 9922249.0, "step": 1805, "train/ce_loss": 1.032567024230957 }, { "epoch": 0.1784654933755191, "step": 1805, "train/sim_loss": 0.11328125 }, { "epoch": 0.1784654933755191, "step": 1805, "train/total_loss": 0.2165379524230957 }, { "entropy": 9.195982933044434, "epoch": 0.1785643662250346, "mean_token_accuracy": 0.7505747079849243, "num_tokens": 9927753.0, "step": 1806, "train/ce_loss": 0.6316457390785217 }, { "epoch": 0.1785643662250346, "step": 1806, "train/sim_loss": 0.12109375 }, { "epoch": 0.1785643662250346, "step": 1806, "train/total_loss": 0.1842583268880844 }, { "entropy": 8.950909614562988, "epoch": 0.17866323907455012, "mean_token_accuracy": 0.7997847199440002, "num_tokens": 9933361.0, "step": 1807, "train/ce_loss": 0.5539090633392334 }, { "epoch": 0.17866323907455012, "step": 1807, "train/sim_loss": 0.06640625 }, { "epoch": 0.17866323907455012, "step": 1807, "train/total_loss": 0.12179715931415558 }, { "entropy": 9.569194793701172, "epoch": 0.17876211192406566, "mean_token_accuracy": 0.7807407379150391, "num_tokens": 9938632.0, "step": 1808, "train/ce_loss": 0.5326206088066101 }, { "epoch": 0.17876211192406566, "step": 1808, "train/sim_loss": 0.08984375 }, { "epoch": 0.17876211192406566, "step": 1808, "train/total_loss": 0.14310580492019653 }, { "entropy": 9.03345775604248, "epoch": 0.17886098477358117, "mean_token_accuracy": 0.7472160458564758, "num_tokens": 9944106.0, "step": 1809, "train/ce_loss": 1.191928505897522 }, { "epoch": 0.17886098477358117, "step": 1809, "train/sim_loss": 0.09375 }, { "epoch": 0.17886098477358117, "step": 1809, "train/total_loss": 0.21294285356998444 }, { "entropy": 9.197105407714844, "epoch": 0.17895985762309669, "mean_token_accuracy": 0.7836322784423828, "num_tokens": 9949627.0, "step": 1810, "train/ce_loss": 0.9575303196907043 }, { "epoch": 0.17895985762309669, "step": 1810, "train/sim_loss": 0.08203125 }, { "epoch": 0.17895985762309669, "step": 1810, "train/total_loss": 0.1777842938899994 }, { "entropy": 9.526527404785156, "epoch": 0.17905873047261223, "mean_token_accuracy": 0.7224669456481934, "num_tokens": 9954810.0, "step": 1811, "train/ce_loss": 0.8765100836753845 }, { "epoch": 0.17905873047261223, "step": 1811, "train/sim_loss": 0.0859375 }, { "epoch": 0.17905873047261223, "step": 1811, "train/total_loss": 0.17358851432800293 }, { "entropy": 9.545660018920898, "epoch": 0.17915760332212774, "mean_token_accuracy": 0.7399425506591797, "num_tokens": 9960064.0, "step": 1812, "train/ce_loss": 1.0299538373947144 }, { "epoch": 0.17915760332212774, "step": 1812, "train/sim_loss": 0.0546875 }, { "epoch": 0.17915760332212774, "step": 1812, "train/total_loss": 0.1576828956604004 }, { "entropy": 8.890352249145508, "epoch": 0.17925647617164325, "mean_token_accuracy": 0.7229268550872803, "num_tokens": 9965743.0, "step": 1813, "train/ce_loss": 1.0172638893127441 }, { "epoch": 0.17925647617164325, "step": 1813, "train/sim_loss": 0.05078125 }, { "epoch": 0.17925647617164325, "step": 1813, "train/total_loss": 0.15250763297080994 }, { "entropy": 9.353113174438477, "epoch": 0.1793553490211588, "mean_token_accuracy": 0.754807710647583, "num_tokens": 9971410.0, "step": 1814, "train/ce_loss": 1.043186902999878 }, { "epoch": 0.1793553490211588, "step": 1814, "train/sim_loss": 0.10546875 }, { "epoch": 0.1793553490211588, "step": 1814, "train/total_loss": 0.20978744328022003 }, { "entropy": 9.366227149963379, "epoch": 0.1794542218706743, "mean_token_accuracy": 0.7997416257858276, "num_tokens": 9976747.0, "step": 1815, "train/ce_loss": 0.48395299911499023 }, { "epoch": 0.1794542218706743, "step": 1815, "train/sim_loss": 0.03515625 }, { "epoch": 0.1794542218706743, "step": 1815, "train/total_loss": 0.0835515558719635 }, { "entropy": 9.336734771728516, "epoch": 0.17955309472018985, "mean_token_accuracy": 0.7596996426582336, "num_tokens": 9982153.0, "step": 1816, "train/ce_loss": 0.9187691807746887 }, { "epoch": 0.17955309472018985, "step": 1816, "train/sim_loss": 0.03125 }, { "epoch": 0.17955309472018985, "step": 1816, "train/total_loss": 0.12312691658735275 }, { "entropy": 9.275434494018555, "epoch": 0.17965196756970536, "mean_token_accuracy": 0.7322834730148315, "num_tokens": 9987547.0, "step": 1817, "train/ce_loss": 0.5449948310852051 }, { "epoch": 0.17965196756970536, "step": 1817, "train/sim_loss": 0.046875 }, { "epoch": 0.17965196756970536, "step": 1817, "train/total_loss": 0.10137448459863663 }, { "entropy": 9.246600151062012, "epoch": 0.17975084041922088, "mean_token_accuracy": 0.7320799231529236, "num_tokens": 9992941.0, "step": 1818, "train/ce_loss": 0.8851469159126282 }, { "epoch": 0.17975084041922088, "step": 1818, "train/sim_loss": 0.078125 }, { "epoch": 0.17975084041922088, "step": 1818, "train/total_loss": 0.16663968563079834 }, { "entropy": 9.182323455810547, "epoch": 0.17984971326873642, "mean_token_accuracy": 0.7446808218955994, "num_tokens": 9998472.0, "step": 1819, "train/ce_loss": 0.7657042145729065 }, { "epoch": 0.17984971326873642, "step": 1819, "train/sim_loss": 0.0546875 }, { "epoch": 0.17984971326873642, "step": 1819, "train/total_loss": 0.13125792145729065 }, { "epoch": 0.17994858611825193, "grad_norm": 0.9131407737731934, "learning_rate": 9.552736982643526e-06, "loss": 0.1472, "step": 1820 }, { "entropy": 9.436616897583008, "epoch": 0.17994858611825193, "mean_token_accuracy": 0.6997354626655579, "num_tokens": 10003787.0, "step": 1820, "train/ce_loss": 0.7963200211524963 }, { "epoch": 0.17994858611825193, "step": 1820, "train/sim_loss": 0.0859375 }, { "epoch": 0.17994858611825193, "step": 1820, "train/total_loss": 0.1655695140361786 }, { "entropy": 9.044649124145508, "epoch": 0.18004745896776744, "mean_token_accuracy": 0.7448359727859497, "num_tokens": 10009237.0, "step": 1821, "train/ce_loss": 0.5059396028518677 }, { "epoch": 0.18004745896776744, "step": 1821, "train/sim_loss": 0.0546875 }, { "epoch": 0.18004745896776744, "step": 1821, "train/total_loss": 0.10528145730495453 }, { "entropy": 9.456413269042969, "epoch": 0.18014633181728298, "mean_token_accuracy": 0.7551867365837097, "num_tokens": 10014480.0, "step": 1822, "train/ce_loss": 0.574988603591919 }, { "epoch": 0.18014633181728298, "step": 1822, "train/sim_loss": 0.02734375 }, { "epoch": 0.18014633181728298, "step": 1822, "train/total_loss": 0.08484260737895966 }, { "entropy": 9.112212181091309, "epoch": 0.1802452046667985, "mean_token_accuracy": 0.7534403800964355, "num_tokens": 10019919.0, "step": 1823, "train/ce_loss": 0.9978711009025574 }, { "epoch": 0.1802452046667985, "step": 1823, "train/sim_loss": 0.09375 }, { "epoch": 0.1802452046667985, "step": 1823, "train/total_loss": 0.19353711605072021 }, { "entropy": 8.900737762451172, "epoch": 0.180344077516314, "mean_token_accuracy": 0.7857889533042908, "num_tokens": 10025534.0, "step": 1824, "train/ce_loss": 0.9477813243865967 }, { "epoch": 0.180344077516314, "step": 1824, "train/sim_loss": 0.05859375 }, { "epoch": 0.180344077516314, "step": 1824, "train/total_loss": 0.1533718854188919 }, { "entropy": 9.098814010620117, "epoch": 0.18044295036582955, "mean_token_accuracy": 0.7451691031455994, "num_tokens": 10031045.0, "step": 1825, "train/ce_loss": 0.8337472081184387 }, { "epoch": 0.18044295036582955, "step": 1825, "train/sim_loss": 0.0703125 }, { "epoch": 0.18044295036582955, "step": 1825, "train/total_loss": 0.1536872237920761 }, { "entropy": 8.901565551757812, "epoch": 0.18054182321534507, "mean_token_accuracy": 0.6921659111976624, "num_tokens": 10036635.0, "step": 1826, "train/ce_loss": 1.2588043212890625 }, { "epoch": 0.18054182321534507, "step": 1826, "train/sim_loss": 0.06640625 }, { "epoch": 0.18054182321534507, "step": 1826, "train/total_loss": 0.1922866851091385 }, { "entropy": 9.076316833496094, "epoch": 0.18064069606486058, "mean_token_accuracy": 0.7365145087242126, "num_tokens": 10042171.0, "step": 1827, "train/ce_loss": 0.8818045258522034 }, { "epoch": 0.18064069606486058, "step": 1827, "train/sim_loss": 0.0703125 }, { "epoch": 0.18064069606486058, "step": 1827, "train/total_loss": 0.15849295258522034 }, { "entropy": 9.27117919921875, "epoch": 0.18073956891437612, "mean_token_accuracy": 0.7347418069839478, "num_tokens": 10047635.0, "step": 1828, "train/ce_loss": 0.7899989485740662 }, { "epoch": 0.18073956891437612, "step": 1828, "train/sim_loss": 0.07421875 }, { "epoch": 0.18073956891437612, "step": 1828, "train/total_loss": 0.15321865677833557 }, { "entropy": 9.453166961669922, "epoch": 0.18083844176389163, "mean_token_accuracy": 0.709549069404602, "num_tokens": 10052937.0, "step": 1829, "train/ce_loss": 0.8667263388633728 }, { "epoch": 0.18083844176389163, "step": 1829, "train/sim_loss": 0.0859375 }, { "epoch": 0.18083844176389163, "step": 1829, "train/total_loss": 0.17261013388633728 }, { "entropy": 9.233354568481445, "epoch": 0.18093731461340715, "mean_token_accuracy": 0.7254437804222107, "num_tokens": 10058420.0, "step": 1830, "train/ce_loss": 0.6182413697242737 }, { "epoch": 0.18093731461340715, "step": 1830, "train/sim_loss": 0.0234375 }, { "epoch": 0.18093731461340715, "step": 1830, "train/total_loss": 0.08526164293289185 }, { "entropy": 9.286273002624512, "epoch": 0.1810361874629227, "mean_token_accuracy": 0.6981595158576965, "num_tokens": 10063816.0, "step": 1831, "train/ce_loss": 0.6372215747833252 }, { "epoch": 0.1810361874629227, "step": 1831, "train/sim_loss": 0.09375 }, { "epoch": 0.1810361874629227, "step": 1831, "train/total_loss": 0.157472163438797 }, { "entropy": 9.072953224182129, "epoch": 0.1811350603124382, "mean_token_accuracy": 0.7644394040107727, "num_tokens": 10069304.0, "step": 1832, "train/ce_loss": 0.6644747257232666 }, { "epoch": 0.1811350603124382, "step": 1832, "train/sim_loss": 0.078125 }, { "epoch": 0.1811350603124382, "step": 1832, "train/total_loss": 0.14457246661186218 }, { "entropy": 9.48664665222168, "epoch": 0.18123393316195371, "mean_token_accuracy": 0.763411283493042, "num_tokens": 10074641.0, "step": 1833, "train/ce_loss": 1.0459281206130981 }, { "epoch": 0.18123393316195371, "step": 1833, "train/sim_loss": 0.0390625 }, { "epoch": 0.18123393316195371, "step": 1833, "train/total_loss": 0.14365531504154205 }, { "entropy": 8.948578834533691, "epoch": 0.18133280601146926, "mean_token_accuracy": 0.7286384701728821, "num_tokens": 10080506.0, "step": 1834, "train/ce_loss": 0.8799277544021606 }, { "epoch": 0.18133280601146926, "step": 1834, "train/sim_loss": 0.08203125 }, { "epoch": 0.18133280601146926, "step": 1834, "train/total_loss": 0.17002403736114502 }, { "entropy": 9.329183578491211, "epoch": 0.18143167886098477, "mean_token_accuracy": 0.7810413837432861, "num_tokens": 10085898.0, "step": 1835, "train/ce_loss": 0.7028175592422485 }, { "epoch": 0.18143167886098477, "step": 1835, "train/sim_loss": 0.0546875 }, { "epoch": 0.18143167886098477, "step": 1835, "train/total_loss": 0.12496925890445709 }, { "entropy": 9.016178131103516, "epoch": 0.1815305517105003, "mean_token_accuracy": 0.7494481205940247, "num_tokens": 10091415.0, "step": 1836, "train/ce_loss": 0.6390730738639832 }, { "epoch": 0.1815305517105003, "step": 1836, "train/sim_loss": 0.09375 }, { "epoch": 0.1815305517105003, "step": 1836, "train/total_loss": 0.15765731036663055 }, { "entropy": 8.95097827911377, "epoch": 0.18162942456001582, "mean_token_accuracy": 0.7238403558731079, "num_tokens": 10097024.0, "step": 1837, "train/ce_loss": 0.3112987279891968 }, { "epoch": 0.18162942456001582, "step": 1837, "train/sim_loss": 0.02734375 }, { "epoch": 0.18162942456001582, "step": 1837, "train/total_loss": 0.0584736242890358 }, { "entropy": 9.377779006958008, "epoch": 0.18172829740953134, "mean_token_accuracy": 0.7448979616165161, "num_tokens": 10102340.0, "step": 1838, "train/ce_loss": 0.5122960805892944 }, { "epoch": 0.18172829740953134, "step": 1838, "train/sim_loss": 0.09375 }, { "epoch": 0.18172829740953134, "step": 1838, "train/total_loss": 0.14497961103916168 }, { "entropy": 9.462257385253906, "epoch": 0.18182717025904688, "mean_token_accuracy": 0.7342105507850647, "num_tokens": 10107724.0, "step": 1839, "train/ce_loss": 1.4329596757888794 }, { "epoch": 0.18182717025904688, "step": 1839, "train/sim_loss": 0.125 }, { "epoch": 0.18182717025904688, "step": 1839, "train/total_loss": 0.2682959735393524 }, { "epoch": 0.1819260431085624, "grad_norm": 0.8848437666893005, "learning_rate": 9.547792117885577e-06, "loss": 0.1568, "step": 1840 }, { "entropy": 9.152231216430664, "epoch": 0.1819260431085624, "mean_token_accuracy": 0.7606318593025208, "num_tokens": 10113148.0, "step": 1840, "train/ce_loss": 0.7314915657043457 }, { "epoch": 0.1819260431085624, "step": 1840, "train/sim_loss": 0.05078125 }, { "epoch": 0.1819260431085624, "step": 1840, "train/total_loss": 0.12393040955066681 }, { "entropy": 8.869501113891602, "epoch": 0.1820249159580779, "mean_token_accuracy": 0.7515528202056885, "num_tokens": 10118803.0, "step": 1841, "train/ce_loss": 0.7807665467262268 }, { "epoch": 0.1820249159580779, "step": 1841, "train/sim_loss": 0.09765625 }, { "epoch": 0.1820249159580779, "step": 1841, "train/total_loss": 0.17573291063308716 }, { "entropy": 9.336627960205078, "epoch": 0.18212378880759345, "mean_token_accuracy": 0.7450000047683716, "num_tokens": 10124224.0, "step": 1842, "train/ce_loss": 0.7422441244125366 }, { "epoch": 0.18212378880759345, "step": 1842, "train/sim_loss": 0.0234375 }, { "epoch": 0.18212378880759345, "step": 1842, "train/total_loss": 0.09766191244125366 }, { "entropy": 9.468748092651367, "epoch": 0.18222266165710896, "mean_token_accuracy": 0.7272727489471436, "num_tokens": 10129555.0, "step": 1843, "train/ce_loss": 0.8200169205665588 }, { "epoch": 0.18222266165710896, "step": 1843, "train/sim_loss": 0.08203125 }, { "epoch": 0.18222266165710896, "step": 1843, "train/total_loss": 0.1640329360961914 }, { "entropy": 8.71921157836914, "epoch": 0.18232153450662447, "mean_token_accuracy": 0.7643040418624878, "num_tokens": 10135333.0, "step": 1844, "train/ce_loss": 1.1447553634643555 }, { "epoch": 0.18232153450662447, "step": 1844, "train/sim_loss": 0.09375 }, { "epoch": 0.18232153450662447, "step": 1844, "train/total_loss": 0.2082255482673645 }, { "entropy": 9.358652114868164, "epoch": 0.18242040735614, "mean_token_accuracy": 0.7272727489471436, "num_tokens": 10140607.0, "step": 1845, "train/ce_loss": 0.7111326456069946 }, { "epoch": 0.18242040735614, "step": 1845, "train/sim_loss": 0.10546875 }, { "epoch": 0.18242040735614, "step": 1845, "train/total_loss": 0.17658200860023499 }, { "entropy": 9.202983856201172, "epoch": 0.18251928020565553, "mean_token_accuracy": 0.7687926888465881, "num_tokens": 10146087.0, "step": 1846, "train/ce_loss": 0.5673274993896484 }, { "epoch": 0.18251928020565553, "step": 1846, "train/sim_loss": 0.02734375 }, { "epoch": 0.18251928020565553, "step": 1846, "train/total_loss": 0.08407650142908096 }, { "entropy": 9.148828506469727, "epoch": 0.18261815305517104, "mean_token_accuracy": 0.7625979781150818, "num_tokens": 10151597.0, "step": 1847, "train/ce_loss": 0.7933956980705261 }, { "epoch": 0.18261815305517104, "step": 1847, "train/sim_loss": 0.1328125 }, { "epoch": 0.18261815305517104, "step": 1847, "train/total_loss": 0.21215206384658813 }, { "entropy": 9.225955963134766, "epoch": 0.18271702590468658, "mean_token_accuracy": 0.7288135886192322, "num_tokens": 10157006.0, "step": 1848, "train/ce_loss": 1.1209206581115723 }, { "epoch": 0.18271702590468658, "step": 1848, "train/sim_loss": 0.1328125 }, { "epoch": 0.18271702590468658, "step": 1848, "train/total_loss": 0.24490457773208618 }, { "entropy": 9.141236305236816, "epoch": 0.1828158987542021, "mean_token_accuracy": 0.7777777910232544, "num_tokens": 10162281.0, "step": 1849, "train/ce_loss": 0.7110844850540161 }, { "epoch": 0.1828158987542021, "step": 1849, "train/sim_loss": 0.08984375 }, { "epoch": 0.1828158987542021, "step": 1849, "train/total_loss": 0.16095221042633057 }, { "entropy": 9.112945556640625, "epoch": 0.1829147716037176, "mean_token_accuracy": 0.7536723017692566, "num_tokens": 10167753.0, "step": 1850, "train/ce_loss": 1.000208854675293 }, { "epoch": 0.1829147716037176, "step": 1850, "train/sim_loss": 0.05859375 }, { "epoch": 0.1829147716037176, "step": 1850, "train/total_loss": 0.1586146354675293 }, { "entropy": 9.130107879638672, "epoch": 0.18301364445323315, "mean_token_accuracy": 0.7804877758026123, "num_tokens": 10173276.0, "step": 1851, "train/ce_loss": 0.9680579304695129 }, { "epoch": 0.18301364445323315, "step": 1851, "train/sim_loss": 0.1171875 }, { "epoch": 0.18301364445323315, "step": 1851, "train/total_loss": 0.21399329602718353 }, { "entropy": 8.902017593383789, "epoch": 0.18311251730274866, "mean_token_accuracy": 0.6965944170951843, "num_tokens": 10178866.0, "step": 1852, "train/ce_loss": 1.3198859691619873 }, { "epoch": 0.18311251730274866, "step": 1852, "train/sim_loss": 0.0703125 }, { "epoch": 0.18311251730274866, "step": 1852, "train/total_loss": 0.20230109989643097 }, { "entropy": 9.455095291137695, "epoch": 0.18321139015226418, "mean_token_accuracy": 0.7809139490127563, "num_tokens": 10184220.0, "step": 1853, "train/ce_loss": 0.6142395734786987 }, { "epoch": 0.18321139015226418, "step": 1853, "train/sim_loss": 0.03515625 }, { "epoch": 0.18321139015226418, "step": 1853, "train/total_loss": 0.09658020734786987 }, { "entropy": 9.29095458984375, "epoch": 0.18331026300177972, "mean_token_accuracy": 0.7231183052062988, "num_tokens": 10189591.0, "step": 1854, "train/ce_loss": 0.8456332087516785 }, { "epoch": 0.18331026300177972, "step": 1854, "train/sim_loss": 0.01953125 }, { "epoch": 0.18331026300177972, "step": 1854, "train/total_loss": 0.10409457236528397 }, { "entropy": 9.466253280639648, "epoch": 0.18340913585129523, "mean_token_accuracy": 0.7510668635368347, "num_tokens": 10194907.0, "step": 1855, "train/ce_loss": 0.9346392750740051 }, { "epoch": 0.18340913585129523, "step": 1855, "train/sim_loss": 0.046875 }, { "epoch": 0.18340913585129523, "step": 1855, "train/total_loss": 0.1403389275074005 }, { "entropy": 9.186201095581055, "epoch": 0.18350800870081077, "mean_token_accuracy": 0.7347995042800903, "num_tokens": 10200354.0, "step": 1856, "train/ce_loss": 0.9085715413093567 }, { "epoch": 0.18350800870081077, "step": 1856, "train/sim_loss": 0.10546875 }, { "epoch": 0.18350800870081077, "step": 1856, "train/total_loss": 0.1963258981704712 }, { "entropy": 9.485877990722656, "epoch": 0.18360688155032628, "mean_token_accuracy": 0.7536023259162903, "num_tokens": 10205610.0, "step": 1857, "train/ce_loss": 0.5484218597412109 }, { "epoch": 0.18360688155032628, "step": 1857, "train/sim_loss": 0.04296875 }, { "epoch": 0.18360688155032628, "step": 1857, "train/total_loss": 0.09781093895435333 }, { "entropy": 9.43083381652832, "epoch": 0.1837057543998418, "mean_token_accuracy": 0.6984536051750183, "num_tokens": 10211007.0, "step": 1858, "train/ce_loss": 1.0466711521148682 }, { "epoch": 0.1837057543998418, "step": 1858, "train/sim_loss": 0.05859375 }, { "epoch": 0.1837057543998418, "step": 1858, "train/total_loss": 0.16326087713241577 }, { "entropy": 9.362489700317383, "epoch": 0.18380462724935734, "mean_token_accuracy": 0.7592814564704895, "num_tokens": 10216417.0, "step": 1859, "train/ce_loss": 0.71994948387146 }, { "epoch": 0.18380462724935734, "step": 1859, "train/sim_loss": 0.0625 }, { "epoch": 0.18380462724935734, "step": 1859, "train/total_loss": 0.13449496030807495 }, { "epoch": 0.18390350009887285, "grad_norm": 0.8774592280387878, "learning_rate": 9.542847253127627e-06, "loss": 0.1484, "step": 1860 }, { "entropy": 9.394652366638184, "epoch": 0.18390350009887285, "mean_token_accuracy": 0.7270306348800659, "num_tokens": 10221827.0, "step": 1860, "train/ce_loss": 1.121170997619629 }, { "epoch": 0.18390350009887285, "step": 1860, "train/sim_loss": 0.0703125 }, { "epoch": 0.18390350009887285, "step": 1860, "train/total_loss": 0.18242961168289185 }, { "entropy": 8.903759956359863, "epoch": 0.18400237294838837, "mean_token_accuracy": 0.7591721415519714, "num_tokens": 10227571.0, "step": 1861, "train/ce_loss": 0.5812728404998779 }, { "epoch": 0.18400237294838837, "step": 1861, "train/sim_loss": 0.03125 }, { "epoch": 0.18400237294838837, "step": 1861, "train/total_loss": 0.08937728404998779 }, { "entropy": 9.047499656677246, "epoch": 0.1841012457979039, "mean_token_accuracy": 0.7352941036224365, "num_tokens": 10233113.0, "step": 1862, "train/ce_loss": 1.2952951192855835 }, { "epoch": 0.1841012457979039, "step": 1862, "train/sim_loss": 0.08984375 }, { "epoch": 0.1841012457979039, "step": 1862, "train/total_loss": 0.21937327086925507 }, { "entropy": 9.147682189941406, "epoch": 0.18420011864741942, "mean_token_accuracy": 0.7579908967018127, "num_tokens": 10238574.0, "step": 1863, "train/ce_loss": 0.6682887077331543 }, { "epoch": 0.18420011864741942, "step": 1863, "train/sim_loss": 0.05078125 }, { "epoch": 0.18420011864741942, "step": 1863, "train/total_loss": 0.11761011928319931 }, { "entropy": 8.944488525390625, "epoch": 0.18429899149693493, "mean_token_accuracy": 0.7125803232192993, "num_tokens": 10244202.0, "step": 1864, "train/ce_loss": 0.6469882130622864 }, { "epoch": 0.18429899149693493, "step": 1864, "train/sim_loss": 0.046875 }, { "epoch": 0.18429899149693493, "step": 1864, "train/total_loss": 0.11157382279634476 }, { "entropy": 9.171306610107422, "epoch": 0.18439786434645047, "mean_token_accuracy": 0.6847237348556519, "num_tokens": 10249716.0, "step": 1865, "train/ce_loss": 1.146244764328003 }, { "epoch": 0.18439786434645047, "step": 1865, "train/sim_loss": 0.12890625 }, { "epoch": 0.18439786434645047, "step": 1865, "train/total_loss": 0.24353072047233582 }, { "entropy": 9.11307430267334, "epoch": 0.184496737195966, "mean_token_accuracy": 0.6980306506156921, "num_tokens": 10255310.0, "step": 1866, "train/ce_loss": 0.7134320139884949 }, { "epoch": 0.184496737195966, "step": 1866, "train/sim_loss": 0.0859375 }, { "epoch": 0.184496737195966, "step": 1866, "train/total_loss": 0.15728071331977844 }, { "entropy": 9.33814811706543, "epoch": 0.1845956100454815, "mean_token_accuracy": 0.7289719581604004, "num_tokens": 10260729.0, "step": 1867, "train/ce_loss": 0.9380409121513367 }, { "epoch": 0.1845956100454815, "step": 1867, "train/sim_loss": 0.0703125 }, { "epoch": 0.1845956100454815, "step": 1867, "train/total_loss": 0.16411659121513367 }, { "entropy": 9.131216049194336, "epoch": 0.18469448289499704, "mean_token_accuracy": 0.7578475475311279, "num_tokens": 10266293.0, "step": 1868, "train/ce_loss": 0.5835964679718018 }, { "epoch": 0.18469448289499704, "step": 1868, "train/sim_loss": 0.06640625 }, { "epoch": 0.18469448289499704, "step": 1868, "train/total_loss": 0.12476590275764465 }, { "entropy": 9.43834400177002, "epoch": 0.18479335574451256, "mean_token_accuracy": 0.7378129363059998, "num_tokens": 10271666.0, "step": 1869, "train/ce_loss": 0.6353655457496643 }, { "epoch": 0.18479335574451256, "step": 1869, "train/sim_loss": 0.0546875 }, { "epoch": 0.18479335574451256, "step": 1869, "train/total_loss": 0.11822405457496643 }, { "entropy": 9.326549530029297, "epoch": 0.18489222859402807, "mean_token_accuracy": 0.7279151678085327, "num_tokens": 10277140.0, "step": 1870, "train/ce_loss": 0.6107400059700012 }, { "epoch": 0.18489222859402807, "step": 1870, "train/sim_loss": 0.05078125 }, { "epoch": 0.18489222859402807, "step": 1870, "train/total_loss": 0.11185525357723236 }, { "entropy": 9.446032524108887, "epoch": 0.1849911014435436, "mean_token_accuracy": 0.7622842192649841, "num_tokens": 10282405.0, "step": 1871, "train/ce_loss": 0.7769367098808289 }, { "epoch": 0.1849911014435436, "step": 1871, "train/sim_loss": 0.03125 }, { "epoch": 0.1849911014435436, "step": 1871, "train/total_loss": 0.10894367098808289 }, { "entropy": 9.393817901611328, "epoch": 0.18508997429305912, "mean_token_accuracy": 0.7222982048988342, "num_tokens": 10287785.0, "step": 1872, "train/ce_loss": 0.570745587348938 }, { "epoch": 0.18508997429305912, "step": 1872, "train/sim_loss": 0.12890625 }, { "epoch": 0.18508997429305912, "step": 1872, "train/total_loss": 0.18598081171512604 }, { "entropy": 9.492561340332031, "epoch": 0.18518884714257464, "mean_token_accuracy": 0.729764997959137, "num_tokens": 10293030.0, "step": 1873, "train/ce_loss": 0.7754833698272705 }, { "epoch": 0.18518884714257464, "step": 1873, "train/sim_loss": 0.046875 }, { "epoch": 0.18518884714257464, "step": 1873, "train/total_loss": 0.12442333996295929 }, { "entropy": 9.435754776000977, "epoch": 0.18528771999209018, "mean_token_accuracy": 0.7375327944755554, "num_tokens": 10298390.0, "step": 1874, "train/ce_loss": 0.4507581293582916 }, { "epoch": 0.18528771999209018, "step": 1874, "train/sim_loss": 0.1171875 }, { "epoch": 0.18528771999209018, "step": 1874, "train/total_loss": 0.16226331889629364 }, { "entropy": 9.155003547668457, "epoch": 0.1853865928416057, "mean_token_accuracy": 0.7387606501579285, "num_tokens": 10303844.0, "step": 1875, "train/ce_loss": 0.4662003815174103 }, { "epoch": 0.1853865928416057, "step": 1875, "train/sim_loss": 0.0234375 }, { "epoch": 0.1853865928416057, "step": 1875, "train/total_loss": 0.07005754113197327 }, { "entropy": 8.850626945495605, "epoch": 0.18548546569112123, "mean_token_accuracy": 0.695695698261261, "num_tokens": 10309539.0, "step": 1876, "train/ce_loss": 0.7083299160003662 }, { "epoch": 0.18548546569112123, "step": 1876, "train/sim_loss": 0.09375 }, { "epoch": 0.18548546569112123, "step": 1876, "train/total_loss": 0.1645829975605011 }, { "entropy": 9.221329689025879, "epoch": 0.18558433854063675, "mean_token_accuracy": 0.7567873597145081, "num_tokens": 10315025.0, "step": 1877, "train/ce_loss": 1.7331838607788086 }, { "epoch": 0.18558433854063675, "step": 1877, "train/sim_loss": 0.09765625 }, { "epoch": 0.18558433854063675, "step": 1877, "train/total_loss": 0.27097463607788086 }, { "entropy": 9.320459365844727, "epoch": 0.18568321139015226, "mean_token_accuracy": 0.756892204284668, "num_tokens": 10320416.0, "step": 1878, "train/ce_loss": 0.42243868112564087 }, { "epoch": 0.18568321139015226, "step": 1878, "train/sim_loss": 0.05859375 }, { "epoch": 0.18568321139015226, "step": 1878, "train/total_loss": 0.10083761811256409 }, { "entropy": 9.487055778503418, "epoch": 0.1857820842396678, "mean_token_accuracy": 0.7665198445320129, "num_tokens": 10325709.0, "step": 1879, "train/ce_loss": 1.0373822450637817 }, { "epoch": 0.1857820842396678, "step": 1879, "train/sim_loss": 0.09375 }, { "epoch": 0.1857820842396678, "step": 1879, "train/total_loss": 0.1974882185459137 }, { "epoch": 0.1858809570891833, "grad_norm": 1.022047758102417, "learning_rate": 9.53790238836968e-06, "loss": 0.1641, "step": 1880 }, { "entropy": 9.258097648620605, "epoch": 0.1858809570891833, "mean_token_accuracy": 0.7581920623779297, "num_tokens": 10331153.0, "step": 1880, "train/ce_loss": 1.0120632648468018 }, { "epoch": 0.1858809570891833, "step": 1880, "train/sim_loss": 0.0703125 }, { "epoch": 0.1858809570891833, "step": 1880, "train/total_loss": 0.17151883244514465 }, { "entropy": 9.329315185546875, "epoch": 0.18597982993869883, "mean_token_accuracy": 0.7146464586257935, "num_tokens": 10336589.0, "step": 1881, "train/ce_loss": 1.1716697216033936 }, { "epoch": 0.18597982993869883, "step": 1881, "train/sim_loss": 0.1015625 }, { "epoch": 0.18597982993869883, "step": 1881, "train/total_loss": 0.21872946619987488 }, { "entropy": 9.09282398223877, "epoch": 0.18607870278821437, "mean_token_accuracy": 0.7692307829856873, "num_tokens": 10342152.0, "step": 1882, "train/ce_loss": 0.5545892715454102 }, { "epoch": 0.18607870278821437, "step": 1882, "train/sim_loss": 0.03125 }, { "epoch": 0.18607870278821437, "step": 1882, "train/total_loss": 0.0867089331150055 }, { "entropy": 9.109855651855469, "epoch": 0.18617757563772988, "mean_token_accuracy": 0.7334878444671631, "num_tokens": 10347696.0, "step": 1883, "train/ce_loss": 1.3491941690444946 }, { "epoch": 0.18617757563772988, "step": 1883, "train/sim_loss": 0.09375 }, { "epoch": 0.18617757563772988, "step": 1883, "train/total_loss": 0.2286694198846817 }, { "entropy": 9.331661224365234, "epoch": 0.1862764484872454, "mean_token_accuracy": 0.7227488160133362, "num_tokens": 10353060.0, "step": 1884, "train/ce_loss": 1.0564920902252197 }, { "epoch": 0.1862764484872454, "step": 1884, "train/sim_loss": 0.0625 }, { "epoch": 0.1862764484872454, "step": 1884, "train/total_loss": 0.1681492030620575 }, { "entropy": 9.109003067016602, "epoch": 0.18637532133676094, "mean_token_accuracy": 0.7825630307197571, "num_tokens": 10358588.0, "step": 1885, "train/ce_loss": 0.4258757531642914 }, { "epoch": 0.18637532133676094, "step": 1885, "train/sim_loss": 0.03125 }, { "epoch": 0.18637532133676094, "step": 1885, "train/total_loss": 0.07383757829666138 }, { "entropy": 9.401508331298828, "epoch": 0.18647419418627645, "mean_token_accuracy": 0.7696709632873535, "num_tokens": 10363920.0, "step": 1886, "train/ce_loss": 0.560555100440979 }, { "epoch": 0.18647419418627645, "step": 1886, "train/sim_loss": 0.1015625 }, { "epoch": 0.18647419418627645, "step": 1886, "train/total_loss": 0.15761801600456238 }, { "entropy": 8.887173652648926, "epoch": 0.18657306703579196, "mean_token_accuracy": 0.7274418473243713, "num_tokens": 10369589.0, "step": 1887, "train/ce_loss": 1.0587177276611328 }, { "epoch": 0.18657306703579196, "step": 1887, "train/sim_loss": 0.0625 }, { "epoch": 0.18657306703579196, "step": 1887, "train/total_loss": 0.1683717668056488 }, { "entropy": 9.262166976928711, "epoch": 0.1866719398853075, "mean_token_accuracy": 0.6955941319465637, "num_tokens": 10374970.0, "step": 1888, "train/ce_loss": 0.8957780003547668 }, { "epoch": 0.1866719398853075, "step": 1888, "train/sim_loss": 0.078125 }, { "epoch": 0.1866719398853075, "step": 1888, "train/total_loss": 0.1677027940750122 }, { "entropy": 9.277833938598633, "epoch": 0.18677081273482302, "mean_token_accuracy": 0.7582159638404846, "num_tokens": 10380477.0, "step": 1889, "train/ce_loss": 0.6836897730827332 }, { "epoch": 0.18677081273482302, "step": 1889, "train/sim_loss": 0.01953125 }, { "epoch": 0.18677081273482302, "step": 1889, "train/total_loss": 0.08790022879838943 }, { "entropy": 8.429864883422852, "epoch": 0.18686968558433853, "mean_token_accuracy": 0.6898569464683533, "num_tokens": 10386518.0, "step": 1890, "train/ce_loss": 0.25205501914024353 }, { "epoch": 0.18686968558433853, "step": 1890, "train/sim_loss": 0.07421875 }, { "epoch": 0.18686968558433853, "step": 1890, "train/total_loss": 0.09942425042390823 }, { "entropy": 9.160074234008789, "epoch": 0.18696855843385407, "mean_token_accuracy": 0.7471655607223511, "num_tokens": 10392007.0, "step": 1891, "train/ce_loss": 0.6151145100593567 }, { "epoch": 0.18696855843385407, "step": 1891, "train/sim_loss": 0.09375 }, { "epoch": 0.18696855843385407, "step": 1891, "train/total_loss": 0.15526145696640015 }, { "entropy": 9.33702278137207, "epoch": 0.18706743128336958, "mean_token_accuracy": 0.7566204071044922, "num_tokens": 10397384.0, "step": 1892, "train/ce_loss": 0.7721154093742371 }, { "epoch": 0.18706743128336958, "step": 1892, "train/sim_loss": 0.08984375 }, { "epoch": 0.18706743128336958, "step": 1892, "train/total_loss": 0.16705529391765594 }, { "entropy": 9.01545238494873, "epoch": 0.1871663041328851, "mean_token_accuracy": 0.7481243014335632, "num_tokens": 10402923.0, "step": 1893, "train/ce_loss": 0.766142725944519 }, { "epoch": 0.1871663041328851, "step": 1893, "train/sim_loss": 0.03125 }, { "epoch": 0.1871663041328851, "step": 1893, "train/total_loss": 0.10786427557468414 }, { "entropy": 9.325172424316406, "epoch": 0.18726517698240064, "mean_token_accuracy": 0.7276073694229126, "num_tokens": 10408277.0, "step": 1894, "train/ce_loss": 0.841708779335022 }, { "epoch": 0.18726517698240064, "step": 1894, "train/sim_loss": 0.109375 }, { "epoch": 0.18726517698240064, "step": 1894, "train/total_loss": 0.1935458779335022 }, { "entropy": 9.100358963012695, "epoch": 0.18736404983191615, "mean_token_accuracy": 0.7456846833229065, "num_tokens": 10413816.0, "step": 1895, "train/ce_loss": 0.9738805890083313 }, { "epoch": 0.18736404983191615, "step": 1895, "train/sim_loss": 0.0703125 }, { "epoch": 0.18736404983191615, "step": 1895, "train/total_loss": 0.16770055890083313 }, { "entropy": 9.204476356506348, "epoch": 0.18746292268143167, "mean_token_accuracy": 0.7633832693099976, "num_tokens": 10419376.0, "step": 1896, "train/ce_loss": 0.4537072479724884 }, { "epoch": 0.18746292268143167, "step": 1896, "train/sim_loss": 0.02734375 }, { "epoch": 0.18746292268143167, "step": 1896, "train/total_loss": 0.07271447777748108 }, { "entropy": 9.247196197509766, "epoch": 0.1875617955309472, "mean_token_accuracy": 0.675644040107727, "num_tokens": 10424864.0, "step": 1897, "train/ce_loss": 1.6643643379211426 }, { "epoch": 0.1875617955309472, "step": 1897, "train/sim_loss": 0.09765625 }, { "epoch": 0.1875617955309472, "step": 1897, "train/total_loss": 0.26409268379211426 }, { "entropy": 9.09526252746582, "epoch": 0.18766066838046272, "mean_token_accuracy": 0.6876938939094543, "num_tokens": 10430384.0, "step": 1898, "train/ce_loss": 0.8235873579978943 }, { "epoch": 0.18766066838046272, "step": 1898, "train/sim_loss": 0.06640625 }, { "epoch": 0.18766066838046272, "step": 1898, "train/total_loss": 0.14876499772071838 }, { "entropy": 8.988293647766113, "epoch": 0.18775954122997826, "mean_token_accuracy": 0.7006869316101074, "num_tokens": 10435957.0, "step": 1899, "train/ce_loss": 0.5891604423522949 }, { "epoch": 0.18775954122997826, "step": 1899, "train/sim_loss": 0.08203125 }, { "epoch": 0.18775954122997826, "step": 1899, "train/total_loss": 0.14094729721546173 }, { "epoch": 0.18785841407949377, "grad_norm": 0.976150631904602, "learning_rate": 9.53295752361173e-06, "loss": 0.1561, "step": 1900 }, { "entropy": 9.667215347290039, "epoch": 0.18785841407949377, "mean_token_accuracy": 0.7836611270904541, "num_tokens": 10441214.0, "step": 1900, "train/ce_loss": 0.7014263868331909 }, { "epoch": 0.18785841407949377, "step": 1900, "train/sim_loss": 0.0625 }, { "epoch": 0.18785841407949377, "step": 1900, "train/total_loss": 0.13264264166355133 }, { "entropy": 9.409549713134766, "epoch": 0.1879572869290093, "mean_token_accuracy": 0.6941340565681458, "num_tokens": 10446542.0, "step": 1901, "train/ce_loss": 0.834379255771637 }, { "epoch": 0.1879572869290093, "step": 1901, "train/sim_loss": 0.078125 }, { "epoch": 0.1879572869290093, "step": 1901, "train/total_loss": 0.16156291961669922 }, { "entropy": 9.527002334594727, "epoch": 0.18805615977852483, "mean_token_accuracy": 0.7380585670471191, "num_tokens": 10451780.0, "step": 1902, "train/ce_loss": 1.0783882141113281 }, { "epoch": 0.18805615977852483, "step": 1902, "train/sim_loss": 0.09375 }, { "epoch": 0.18805615977852483, "step": 1902, "train/total_loss": 0.20158882439136505 }, { "entropy": 9.232093811035156, "epoch": 0.18815503262804034, "mean_token_accuracy": 0.7309874892234802, "num_tokens": 10457254.0, "step": 1903, "train/ce_loss": 0.6993353962898254 }, { "epoch": 0.18815503262804034, "step": 1903, "train/sim_loss": 0.02734375 }, { "epoch": 0.18815503262804034, "step": 1903, "train/total_loss": 0.09727729111909866 }, { "entropy": 9.227006912231445, "epoch": 0.18825390547755586, "mean_token_accuracy": 0.7410604357719421, "num_tokens": 10462667.0, "step": 1904, "train/ce_loss": 0.7635419964790344 }, { "epoch": 0.18825390547755586, "step": 1904, "train/sim_loss": 0.1796875 }, { "epoch": 0.18825390547755586, "step": 1904, "train/total_loss": 0.2560417056083679 }, { "entropy": 9.256549835205078, "epoch": 0.1883527783270714, "mean_token_accuracy": 0.768961489200592, "num_tokens": 10468086.0, "step": 1905, "train/ce_loss": 0.5879771709442139 }, { "epoch": 0.1883527783270714, "step": 1905, "train/sim_loss": 0.04296875 }, { "epoch": 0.1883527783270714, "step": 1905, "train/total_loss": 0.10176646709442139 }, { "entropy": 8.949323654174805, "epoch": 0.1884516511765869, "mean_token_accuracy": 0.6836827993392944, "num_tokens": 10473761.0, "step": 1906, "train/ce_loss": 0.636362612247467 }, { "epoch": 0.1884516511765869, "step": 1906, "train/sim_loss": 0.1171875 }, { "epoch": 0.1884516511765869, "step": 1906, "train/total_loss": 0.18082377314567566 }, { "entropy": 9.501606941223145, "epoch": 0.18855052402610242, "mean_token_accuracy": 0.7211796045303345, "num_tokens": 10479048.0, "step": 1907, "train/ce_loss": 0.4670999348163605 }, { "epoch": 0.18855052402610242, "step": 1907, "train/sim_loss": 0.0546875 }, { "epoch": 0.18855052402610242, "step": 1907, "train/total_loss": 0.10139749944210052 }, { "entropy": 9.398838996887207, "epoch": 0.18864939687561796, "mean_token_accuracy": 0.7605262994766235, "num_tokens": 10484423.0, "step": 1908, "train/ce_loss": 0.8354304432868958 }, { "epoch": 0.18864939687561796, "step": 1908, "train/sim_loss": 0.078125 }, { "epoch": 0.18864939687561796, "step": 1908, "train/total_loss": 0.16166804730892181 }, { "entropy": 9.43191146850586, "epoch": 0.18874826972513348, "mean_token_accuracy": 0.7411907911300659, "num_tokens": 10489782.0, "step": 1909, "train/ce_loss": 1.095832109451294 }, { "epoch": 0.18874826972513348, "step": 1909, "train/sim_loss": 0.09765625 }, { "epoch": 0.18874826972513348, "step": 1909, "train/total_loss": 0.20723946392536163 }, { "entropy": 9.131551742553711, "epoch": 0.188847142574649, "mean_token_accuracy": 0.6827033162117004, "num_tokens": 10495335.0, "step": 1910, "train/ce_loss": 0.9664953947067261 }, { "epoch": 0.188847142574649, "step": 1910, "train/sim_loss": 0.09375 }, { "epoch": 0.188847142574649, "step": 1910, "train/total_loss": 0.19039954245090485 }, { "entropy": 9.151175498962402, "epoch": 0.18894601542416453, "mean_token_accuracy": 0.7675392627716064, "num_tokens": 10500882.0, "step": 1911, "train/ce_loss": 0.6827057600021362 }, { "epoch": 0.18894601542416453, "step": 1911, "train/sim_loss": 0.0625 }, { "epoch": 0.18894601542416453, "step": 1911, "train/total_loss": 0.13077057898044586 }, { "entropy": 8.908767700195312, "epoch": 0.18904488827368005, "mean_token_accuracy": 0.7428004145622253, "num_tokens": 10506552.0, "step": 1912, "train/ce_loss": 0.8164929747581482 }, { "epoch": 0.18904488827368005, "step": 1912, "train/sim_loss": 0.02734375 }, { "epoch": 0.18904488827368005, "step": 1912, "train/total_loss": 0.1089930459856987 }, { "entropy": 9.279717445373535, "epoch": 0.18914376112319556, "mean_token_accuracy": 0.7236679196357727, "num_tokens": 10511875.0, "step": 1913, "train/ce_loss": 1.2527661323547363 }, { "epoch": 0.18914376112319556, "step": 1913, "train/sim_loss": 0.09375 }, { "epoch": 0.18914376112319556, "step": 1913, "train/total_loss": 0.2190266102552414 }, { "entropy": 9.279006958007812, "epoch": 0.1892426339727111, "mean_token_accuracy": 0.7405966520309448, "num_tokens": 10517269.0, "step": 1914, "train/ce_loss": 0.7432740330696106 }, { "epoch": 0.1892426339727111, "step": 1914, "train/sim_loss": 0.08203125 }, { "epoch": 0.1892426339727111, "step": 1914, "train/total_loss": 0.15635865926742554 }, { "entropy": 9.053998947143555, "epoch": 0.1893415068222266, "mean_token_accuracy": 0.7301886677742004, "num_tokens": 10522926.0, "step": 1915, "train/ce_loss": 0.43673595786094666 }, { "epoch": 0.1893415068222266, "step": 1915, "train/sim_loss": 0.06640625 }, { "epoch": 0.1893415068222266, "step": 1915, "train/total_loss": 0.11007984727621078 }, { "entropy": 9.128483772277832, "epoch": 0.18944037967174213, "mean_token_accuracy": 0.7155067324638367, "num_tokens": 10528384.0, "step": 1916, "train/ce_loss": 0.715581476688385 }, { "epoch": 0.18944037967174213, "step": 1916, "train/sim_loss": 0.0625 }, { "epoch": 0.18944037967174213, "step": 1916, "train/total_loss": 0.1340581476688385 }, { "entropy": 9.32099437713623, "epoch": 0.18953925252125767, "mean_token_accuracy": 0.7512437701225281, "num_tokens": 10533761.0, "step": 1917, "train/ce_loss": 0.8493117690086365 }, { "epoch": 0.18953925252125767, "step": 1917, "train/sim_loss": 0.09765625 }, { "epoch": 0.18953925252125767, "step": 1917, "train/total_loss": 0.1825874298810959 }, { "entropy": 9.11954116821289, "epoch": 0.18963812537077318, "mean_token_accuracy": 0.7618576884269714, "num_tokens": 10539283.0, "step": 1918, "train/ce_loss": 1.8299731016159058 }, { "epoch": 0.18963812537077318, "step": 1918, "train/sim_loss": 0.1015625 }, { "epoch": 0.18963812537077318, "step": 1918, "train/total_loss": 0.28455981612205505 }, { "entropy": 9.195821762084961, "epoch": 0.18973699822028872, "mean_token_accuracy": 0.7028688788414001, "num_tokens": 10544825.0, "step": 1919, "train/ce_loss": 0.6317260265350342 }, { "epoch": 0.18973699822028872, "step": 1919, "train/sim_loss": 0.0234375 }, { "epoch": 0.18973699822028872, "step": 1919, "train/total_loss": 0.0866101011633873 }, { "epoch": 0.18983587106980424, "grad_norm": 0.8813591003417969, "learning_rate": 9.528012658853782e-06, "loss": 0.1609, "step": 1920 }, { "entropy": 9.130243301391602, "epoch": 0.18983587106980424, "mean_token_accuracy": 0.6583071947097778, "num_tokens": 10550434.0, "step": 1920, "train/ce_loss": 0.8273386359214783 }, { "epoch": 0.18983587106980424, "step": 1920, "train/sim_loss": 0.0703125 }, { "epoch": 0.18983587106980424, "step": 1920, "train/total_loss": 0.1530463695526123 }, { "entropy": 8.849374771118164, "epoch": 0.18993474391931975, "mean_token_accuracy": 0.6981519460678101, "num_tokens": 10556056.0, "step": 1921, "train/ce_loss": 0.7276865839958191 }, { "epoch": 0.18993474391931975, "step": 1921, "train/sim_loss": 0.140625 }, { "epoch": 0.18993474391931975, "step": 1921, "train/total_loss": 0.2133936583995819 }, { "entropy": 9.557662963867188, "epoch": 0.1900336167688353, "mean_token_accuracy": 0.7551299333572388, "num_tokens": 10561383.0, "step": 1922, "train/ce_loss": 0.8419260382652283 }, { "epoch": 0.1900336167688353, "step": 1922, "train/sim_loss": 0.02734375 }, { "epoch": 0.1900336167688353, "step": 1922, "train/total_loss": 0.11153635382652283 }, { "entropy": 9.061387062072754, "epoch": 0.1901324896183508, "mean_token_accuracy": 0.7462387084960938, "num_tokens": 10566897.0, "step": 1923, "train/ce_loss": 0.9490362405776978 }, { "epoch": 0.1901324896183508, "step": 1923, "train/sim_loss": 0.078125 }, { "epoch": 0.1901324896183508, "step": 1923, "train/total_loss": 0.1730286180973053 }, { "entropy": 9.011480331420898, "epoch": 0.19023136246786632, "mean_token_accuracy": 0.6948275566101074, "num_tokens": 10572594.0, "step": 1924, "train/ce_loss": 2.0057828426361084 }, { "epoch": 0.19023136246786632, "step": 1924, "train/sim_loss": 0.08203125 }, { "epoch": 0.19023136246786632, "step": 1924, "train/total_loss": 0.2826095223426819 }, { "entropy": 9.534296035766602, "epoch": 0.19033023531738186, "mean_token_accuracy": 0.7126436829566956, "num_tokens": 10578168.0, "step": 1925, "train/ce_loss": 1.1737700700759888 }, { "epoch": 0.19033023531738186, "step": 1925, "train/sim_loss": 0.0859375 }, { "epoch": 0.19033023531738186, "step": 1925, "train/total_loss": 0.20331451296806335 }, { "entropy": 9.608262062072754, "epoch": 0.19042910816689737, "mean_token_accuracy": 0.6747503280639648, "num_tokens": 10583495.0, "step": 1926, "train/ce_loss": 0.7353740334510803 }, { "epoch": 0.19042910816689737, "step": 1926, "train/sim_loss": 0.078125 }, { "epoch": 0.19042910816689737, "step": 1926, "train/total_loss": 0.1516624093055725 }, { "entropy": 9.507547378540039, "epoch": 0.19052798101641288, "mean_token_accuracy": 0.7727272510528564, "num_tokens": 10588825.0, "step": 1927, "train/ce_loss": 0.7609487771987915 }, { "epoch": 0.19052798101641288, "step": 1927, "train/sim_loss": 0.12109375 }, { "epoch": 0.19052798101641288, "step": 1927, "train/total_loss": 0.1971886307001114 }, { "entropy": 9.250419616699219, "epoch": 0.19062685386592843, "mean_token_accuracy": 0.7407002449035645, "num_tokens": 10594371.0, "step": 1928, "train/ce_loss": 0.7560986876487732 }, { "epoch": 0.19062685386592843, "step": 1928, "train/sim_loss": 0.0625 }, { "epoch": 0.19062685386592843, "step": 1928, "train/total_loss": 0.13810986280441284 }, { "entropy": 8.720457077026367, "epoch": 0.19072572671544394, "mean_token_accuracy": 0.7422680258750916, "num_tokens": 10600316.0, "step": 1929, "train/ce_loss": 0.3517307937145233 }, { "epoch": 0.19072572671544394, "step": 1929, "train/sim_loss": 0.08203125 }, { "epoch": 0.19072572671544394, "step": 1929, "train/total_loss": 0.11720433086156845 }, { "entropy": 9.429616928100586, "epoch": 0.19082459956495945, "mean_token_accuracy": 0.7428924441337585, "num_tokens": 10605670.0, "step": 1930, "train/ce_loss": 0.8023327589035034 }, { "epoch": 0.19082459956495945, "step": 1930, "train/sim_loss": 0.09375 }, { "epoch": 0.19082459956495945, "step": 1930, "train/total_loss": 0.17398327589035034 }, { "entropy": 9.23723030090332, "epoch": 0.190923472414475, "mean_token_accuracy": 0.7549889087677002, "num_tokens": 10611197.0, "step": 1931, "train/ce_loss": 0.5320907831192017 }, { "epoch": 0.190923472414475, "step": 1931, "train/sim_loss": 0.0390625 }, { "epoch": 0.190923472414475, "step": 1931, "train/total_loss": 0.0922715812921524 }, { "entropy": 9.179306030273438, "epoch": 0.1910223452639905, "mean_token_accuracy": 0.7129436135292053, "num_tokens": 10616748.0, "step": 1932, "train/ce_loss": 0.9169179797172546 }, { "epoch": 0.1910223452639905, "step": 1932, "train/sim_loss": 0.21875 }, { "epoch": 0.1910223452639905, "step": 1932, "train/total_loss": 0.310441792011261 }, { "entropy": 9.251175880432129, "epoch": 0.19112121811350602, "mean_token_accuracy": 0.7613776326179504, "num_tokens": 10622183.0, "step": 1933, "train/ce_loss": 0.6766993999481201 }, { "epoch": 0.19112121811350602, "step": 1933, "train/sim_loss": 0.04296875 }, { "epoch": 0.19112121811350602, "step": 1933, "train/total_loss": 0.11063869297504425 }, { "entropy": 9.19108772277832, "epoch": 0.19122009096302156, "mean_token_accuracy": 0.7399346828460693, "num_tokens": 10627739.0, "step": 1934, "train/ce_loss": 0.8071695566177368 }, { "epoch": 0.19122009096302156, "step": 1934, "train/sim_loss": 0.0703125 }, { "epoch": 0.19122009096302156, "step": 1934, "train/total_loss": 0.15102946758270264 }, { "entropy": 9.07003402709961, "epoch": 0.19131896381253707, "mean_token_accuracy": 0.7620041966438293, "num_tokens": 10633384.0, "step": 1935, "train/ce_loss": 1.4630893468856812 }, { "epoch": 0.19131896381253707, "step": 1935, "train/sim_loss": 0.09765625 }, { "epoch": 0.19131896381253707, "step": 1935, "train/total_loss": 0.24396519362926483 }, { "entropy": 9.489038467407227, "epoch": 0.1914178366620526, "mean_token_accuracy": 0.7400274872779846, "num_tokens": 10638717.0, "step": 1936, "train/ce_loss": 0.5636811852455139 }, { "epoch": 0.1914178366620526, "step": 1936, "train/sim_loss": 0.05859375 }, { "epoch": 0.1914178366620526, "step": 1936, "train/total_loss": 0.11496187001466751 }, { "entropy": 9.36467456817627, "epoch": 0.19151670951156813, "mean_token_accuracy": 0.707058846950531, "num_tokens": 10644172.0, "step": 1937, "train/ce_loss": 1.2295435667037964 }, { "epoch": 0.19151670951156813, "step": 1937, "train/sim_loss": 0.1015625 }, { "epoch": 0.19151670951156813, "step": 1937, "train/total_loss": 0.2245168685913086 }, { "entropy": 9.389647483825684, "epoch": 0.19161558236108364, "mean_token_accuracy": 0.6959287524223328, "num_tokens": 10649625.0, "step": 1938, "train/ce_loss": 0.9396371841430664 }, { "epoch": 0.19161558236108364, "step": 1938, "train/sim_loss": 0.05859375 }, { "epoch": 0.19161558236108364, "step": 1938, "train/total_loss": 0.15255746245384216 }, { "entropy": 9.440783500671387, "epoch": 0.19171445521059918, "mean_token_accuracy": 0.7524475455284119, "num_tokens": 10654968.0, "step": 1939, "train/ce_loss": 0.5464025139808655 }, { "epoch": 0.19171445521059918, "step": 1939, "train/sim_loss": 0.0703125 }, { "epoch": 0.19171445521059918, "step": 1939, "train/total_loss": 0.12495274841785431 }, { "epoch": 0.1918133280601147, "grad_norm": 0.8685644268989563, "learning_rate": 9.523067794095833e-06, "loss": 0.1663, "step": 1940 }, { "entropy": 8.91616153717041, "epoch": 0.1918133280601147, "mean_token_accuracy": 0.7251356244087219, "num_tokens": 10660731.0, "step": 1940, "train/ce_loss": 0.9099739789962769 }, { "epoch": 0.1918133280601147, "step": 1940, "train/sim_loss": 0.09375 }, { "epoch": 0.1918133280601147, "step": 1940, "train/total_loss": 0.18474739789962769 }, { "entropy": 9.024537086486816, "epoch": 0.1919122009096302, "mean_token_accuracy": 0.7058242559432983, "num_tokens": 10666328.0, "step": 1941, "train/ce_loss": 0.7703852653503418 }, { "epoch": 0.1919122009096302, "step": 1941, "train/sim_loss": 0.078125 }, { "epoch": 0.1919122009096302, "step": 1941, "train/total_loss": 0.15516352653503418 }, { "entropy": 9.478693008422852, "epoch": 0.19201107375914575, "mean_token_accuracy": 0.6799530982971191, "num_tokens": 10671747.0, "step": 1942, "train/ce_loss": 0.44840386509895325 }, { "epoch": 0.19201107375914575, "step": 1942, "train/sim_loss": 0.09765625 }, { "epoch": 0.19201107375914575, "step": 1942, "train/total_loss": 0.14249664545059204 }, { "entropy": 9.14114761352539, "epoch": 0.19210994660866126, "mean_token_accuracy": 0.792475700378418, "num_tokens": 10677204.0, "step": 1943, "train/ce_loss": 0.6565990447998047 }, { "epoch": 0.19210994660866126, "step": 1943, "train/sim_loss": 0.03515625 }, { "epoch": 0.19210994660866126, "step": 1943, "train/total_loss": 0.10081615298986435 }, { "entropy": 9.120071411132812, "epoch": 0.19220881945817678, "mean_token_accuracy": 0.7535070180892944, "num_tokens": 10682874.0, "step": 1944, "train/ce_loss": 0.8580129742622375 }, { "epoch": 0.19220881945817678, "step": 1944, "train/sim_loss": 0.08984375 }, { "epoch": 0.19220881945817678, "step": 1944, "train/total_loss": 0.17564505338668823 }, { "entropy": 9.042863845825195, "epoch": 0.19230769230769232, "mean_token_accuracy": 0.7194473743438721, "num_tokens": 10688416.0, "step": 1945, "train/ce_loss": 1.1533880233764648 }, { "epoch": 0.19230769230769232, "step": 1945, "train/sim_loss": 0.09375 }, { "epoch": 0.19230769230769232, "step": 1945, "train/total_loss": 0.20908880233764648 }, { "entropy": 9.360310554504395, "epoch": 0.19240656515720783, "mean_token_accuracy": 0.7220670580863953, "num_tokens": 10693766.0, "step": 1946, "train/ce_loss": 0.8523736596107483 }, { "epoch": 0.19240656515720783, "step": 1946, "train/sim_loss": 0.09375 }, { "epoch": 0.19240656515720783, "step": 1946, "train/total_loss": 0.17898736894130707 }, { "entropy": 9.021282196044922, "epoch": 0.19250543800672335, "mean_token_accuracy": 0.7301231622695923, "num_tokens": 10699234.0, "step": 1947, "train/ce_loss": 0.8170859217643738 }, { "epoch": 0.19250543800672335, "step": 1947, "train/sim_loss": 0.0234375 }, { "epoch": 0.19250543800672335, "step": 1947, "train/total_loss": 0.10514609515666962 }, { "entropy": 8.96097183227539, "epoch": 0.1926043108562389, "mean_token_accuracy": 0.7443763017654419, "num_tokens": 10704922.0, "step": 1948, "train/ce_loss": 0.9703499674797058 }, { "epoch": 0.1926043108562389, "step": 1948, "train/sim_loss": 0.09765625 }, { "epoch": 0.1926043108562389, "step": 1948, "train/total_loss": 0.1946912407875061 }, { "entropy": 9.380046844482422, "epoch": 0.1927031837057544, "mean_token_accuracy": 0.773809552192688, "num_tokens": 10710351.0, "step": 1949, "train/ce_loss": 0.6281152367591858 }, { "epoch": 0.1927031837057544, "step": 1949, "train/sim_loss": 0.0546875 }, { "epoch": 0.1927031837057544, "step": 1949, "train/total_loss": 0.11749902367591858 }, { "entropy": 9.372064590454102, "epoch": 0.1928020565552699, "mean_token_accuracy": 0.7818182110786438, "num_tokens": 10715794.0, "step": 1950, "train/ce_loss": 0.34171125292778015 }, { "epoch": 0.1928020565552699, "step": 1950, "train/sim_loss": 0.0703125 }, { "epoch": 0.1928020565552699, "step": 1950, "train/total_loss": 0.10448362678289413 }, { "entropy": 9.455821990966797, "epoch": 0.19290092940478545, "mean_token_accuracy": 0.7640750408172607, "num_tokens": 10721193.0, "step": 1951, "train/ce_loss": 0.6590455770492554 }, { "epoch": 0.19290092940478545, "step": 1951, "train/sim_loss": 0.078125 }, { "epoch": 0.19290092940478545, "step": 1951, "train/total_loss": 0.14402955770492554 }, { "entropy": 9.043667793273926, "epoch": 0.19299980225430097, "mean_token_accuracy": 0.7535954117774963, "num_tokens": 10726875.0, "step": 1952, "train/ce_loss": 0.2729942500591278 }, { "epoch": 0.19299980225430097, "step": 1952, "train/sim_loss": 0.07421875 }, { "epoch": 0.19299980225430097, "step": 1952, "train/total_loss": 0.1015181764960289 }, { "entropy": 8.89741325378418, "epoch": 0.19309867510381648, "mean_token_accuracy": 0.8040540814399719, "num_tokens": 10732567.0, "step": 1953, "train/ce_loss": 0.7339692115783691 }, { "epoch": 0.19309867510381648, "step": 1953, "train/sim_loss": 0.0625 }, { "epoch": 0.19309867510381648, "step": 1953, "train/total_loss": 0.13589692115783691 }, { "entropy": 9.34492301940918, "epoch": 0.19319754795333202, "mean_token_accuracy": 0.7396373152732849, "num_tokens": 10737989.0, "step": 1954, "train/ce_loss": 0.8463490009307861 }, { "epoch": 0.19319754795333202, "step": 1954, "train/sim_loss": 0.10546875 }, { "epoch": 0.19319754795333202, "step": 1954, "train/total_loss": 0.1901036500930786 }, { "entropy": 9.232549667358398, "epoch": 0.19329642080284754, "mean_token_accuracy": 0.6849315166473389, "num_tokens": 10743435.0, "step": 1955, "train/ce_loss": 0.9480928778648376 }, { "epoch": 0.19329642080284754, "step": 1955, "train/sim_loss": 0.0625 }, { "epoch": 0.19329642080284754, "step": 1955, "train/total_loss": 0.15730929374694824 }, { "entropy": 9.056482315063477, "epoch": 0.19339529365236305, "mean_token_accuracy": 0.8073394298553467, "num_tokens": 10748951.0, "step": 1956, "train/ce_loss": 0.50166255235672 }, { "epoch": 0.19339529365236305, "step": 1956, "train/sim_loss": 0.02734375 }, { "epoch": 0.19339529365236305, "step": 1956, "train/total_loss": 0.07751000672578812 }, { "entropy": 8.990050315856934, "epoch": 0.1934941665018786, "mean_token_accuracy": 0.7407407164573669, "num_tokens": 10754579.0, "step": 1957, "train/ce_loss": 0.7086498737335205 }, { "epoch": 0.1934941665018786, "step": 1957, "train/sim_loss": 0.046875 }, { "epoch": 0.1934941665018786, "step": 1957, "train/total_loss": 0.11773999035358429 }, { "entropy": 9.293977737426758, "epoch": 0.1935930393513941, "mean_token_accuracy": 0.7521902322769165, "num_tokens": 10760006.0, "step": 1958, "train/ce_loss": 0.7628306746482849 }, { "epoch": 0.1935930393513941, "step": 1958, "train/sim_loss": 0.04296875 }, { "epoch": 0.1935930393513941, "step": 1958, "train/total_loss": 0.11925181746482849 }, { "entropy": 9.229662895202637, "epoch": 0.19369191220090964, "mean_token_accuracy": 0.6816720366477966, "num_tokens": 10765609.0, "step": 1959, "train/ce_loss": 1.8040238618850708 }, { "epoch": 0.19369191220090964, "step": 1959, "train/sim_loss": 0.09765625 }, { "epoch": 0.19369191220090964, "step": 1959, "train/total_loss": 0.27805864810943604 }, { "epoch": 0.19379078505042516, "grad_norm": 1.1649847030639648, "learning_rate": 9.518122929337883e-06, "loss": 0.1542, "step": 1960 }, { "entropy": 9.191696166992188, "epoch": 0.19379078505042516, "mean_token_accuracy": 0.7648484706878662, "num_tokens": 10771031.0, "step": 1960, "train/ce_loss": 0.9112334251403809 }, { "epoch": 0.19379078505042516, "step": 1960, "train/sim_loss": 0.13671875 }, { "epoch": 0.19379078505042516, "step": 1960, "train/total_loss": 0.22784209251403809 }, { "entropy": 9.19484806060791, "epoch": 0.19388965789994067, "mean_token_accuracy": 0.7973568439483643, "num_tokens": 10776558.0, "step": 1961, "train/ce_loss": 0.7289481163024902 }, { "epoch": 0.19388965789994067, "step": 1961, "train/sim_loss": 0.0546875 }, { "epoch": 0.19388965789994067, "step": 1961, "train/total_loss": 0.12758231163024902 }, { "entropy": 9.41118049621582, "epoch": 0.1939885307494562, "mean_token_accuracy": 0.7610738277435303, "num_tokens": 10781886.0, "step": 1962, "train/ce_loss": 0.6289187073707581 }, { "epoch": 0.1939885307494562, "step": 1962, "train/sim_loss": 0.09375 }, { "epoch": 0.1939885307494562, "step": 1962, "train/total_loss": 0.1566418707370758 }, { "entropy": 9.132506370544434, "epoch": 0.19408740359897173, "mean_token_accuracy": 0.8099273443222046, "num_tokens": 10787398.0, "step": 1963, "train/ce_loss": 0.6109817028045654 }, { "epoch": 0.19408740359897173, "step": 1963, "train/sim_loss": 0.09765625 }, { "epoch": 0.19408740359897173, "step": 1963, "train/total_loss": 0.15875442326068878 }, { "entropy": 9.25629711151123, "epoch": 0.19418627644848724, "mean_token_accuracy": 0.7314211130142212, "num_tokens": 10792891.0, "step": 1964, "train/ce_loss": 0.549247682094574 }, { "epoch": 0.19418627644848724, "step": 1964, "train/sim_loss": 0.10546875 }, { "epoch": 0.19418627644848724, "step": 1964, "train/total_loss": 0.16039352118968964 }, { "entropy": 9.243106842041016, "epoch": 0.19428514929800278, "mean_token_accuracy": 0.7380688190460205, "num_tokens": 10798335.0, "step": 1965, "train/ce_loss": 1.0018572807312012 }, { "epoch": 0.19428514929800278, "step": 1965, "train/sim_loss": 0.05859375 }, { "epoch": 0.19428514929800278, "step": 1965, "train/total_loss": 0.15877947211265564 }, { "entropy": 9.076430320739746, "epoch": 0.1943840221475183, "mean_token_accuracy": 0.6976495981216431, "num_tokens": 10803891.0, "step": 1966, "train/ce_loss": 1.0205624103546143 }, { "epoch": 0.1943840221475183, "step": 1966, "train/sim_loss": 0.1328125 }, { "epoch": 0.1943840221475183, "step": 1966, "train/total_loss": 0.23486873507499695 }, { "entropy": 9.57259750366211, "epoch": 0.1944828949970338, "mean_token_accuracy": 0.798258364200592, "num_tokens": 10809180.0, "step": 1967, "train/ce_loss": 0.4768856465816498 }, { "epoch": 0.1944828949970338, "step": 1967, "train/sim_loss": 0.05078125 }, { "epoch": 0.1944828949970338, "step": 1967, "train/total_loss": 0.0984698161482811 }, { "entropy": 8.855634689331055, "epoch": 0.19458176784654935, "mean_token_accuracy": 0.7044079303741455, "num_tokens": 10814888.0, "step": 1968, "train/ce_loss": 2.221090078353882 }, { "epoch": 0.19458176784654935, "step": 1968, "train/sim_loss": 0.0625 }, { "epoch": 0.19458176784654935, "step": 1968, "train/total_loss": 0.28460901975631714 }, { "entropy": 9.434700965881348, "epoch": 0.19468064069606486, "mean_token_accuracy": 0.7381545901298523, "num_tokens": 10820238.0, "step": 1969, "train/ce_loss": 0.5119531154632568 }, { "epoch": 0.19468064069606486, "step": 1969, "train/sim_loss": 0.02734375 }, { "epoch": 0.19468064069606486, "step": 1969, "train/total_loss": 0.07853905856609344 }, { "entropy": 9.202564239501953, "epoch": 0.19477951354558037, "mean_token_accuracy": 0.7771797776222229, "num_tokens": 10825742.0, "step": 1970, "train/ce_loss": 0.4919135868549347 }, { "epoch": 0.19477951354558037, "step": 1970, "train/sim_loss": 0.0625 }, { "epoch": 0.19477951354558037, "step": 1970, "train/total_loss": 0.11169135570526123 }, { "entropy": 8.828511238098145, "epoch": 0.19487838639509591, "mean_token_accuracy": 0.7057335376739502, "num_tokens": 10831560.0, "step": 1971, "train/ce_loss": 0.9558663368225098 }, { "epoch": 0.19487838639509591, "step": 1971, "train/sim_loss": 0.0703125 }, { "epoch": 0.19487838639509591, "step": 1971, "train/total_loss": 0.1658991277217865 }, { "entropy": 9.279796600341797, "epoch": 0.19497725924461143, "mean_token_accuracy": 0.6903614401817322, "num_tokens": 10837040.0, "step": 1972, "train/ce_loss": 0.4941887855529785 }, { "epoch": 0.19497725924461143, "step": 1972, "train/sim_loss": 0.0546875 }, { "epoch": 0.19497725924461143, "step": 1972, "train/total_loss": 0.10410638153553009 }, { "entropy": 9.404963493347168, "epoch": 0.19507613209412694, "mean_token_accuracy": 0.7805194854736328, "num_tokens": 10842392.0, "step": 1973, "train/ce_loss": 0.8931805491447449 }, { "epoch": 0.19507613209412694, "step": 1973, "train/sim_loss": 0.0703125 }, { "epoch": 0.19507613209412694, "step": 1973, "train/total_loss": 0.15963056683540344 }, { "entropy": 9.100309371948242, "epoch": 0.19517500494364248, "mean_token_accuracy": 0.725853681564331, "num_tokens": 10847982.0, "step": 1974, "train/ce_loss": 0.7615547776222229 }, { "epoch": 0.19517500494364248, "step": 1974, "train/sim_loss": 0.04296875 }, { "epoch": 0.19517500494364248, "step": 1974, "train/total_loss": 0.11912422627210617 }, { "entropy": 9.449594497680664, "epoch": 0.195273877793158, "mean_token_accuracy": 0.7480519413948059, "num_tokens": 10853306.0, "step": 1975, "train/ce_loss": 0.761044979095459 }, { "epoch": 0.195273877793158, "step": 1975, "train/sim_loss": 0.06640625 }, { "epoch": 0.195273877793158, "step": 1975, "train/total_loss": 0.14251074194908142 }, { "entropy": 8.721174240112305, "epoch": 0.1953727506426735, "mean_token_accuracy": 0.7105058431625366, "num_tokens": 10859165.0, "step": 1976, "train/ce_loss": 0.5387590527534485 }, { "epoch": 0.1953727506426735, "step": 1976, "train/sim_loss": 0.09375 }, { "epoch": 0.1953727506426735, "step": 1976, "train/total_loss": 0.1476259082555771 }, { "entropy": 9.225479125976562, "epoch": 0.19547162349218905, "mean_token_accuracy": 0.770165741443634, "num_tokens": 10864664.0, "step": 1977, "train/ce_loss": 0.895075798034668 }, { "epoch": 0.19547162349218905, "step": 1977, "train/sim_loss": 0.046875 }, { "epoch": 0.19547162349218905, "step": 1977, "train/total_loss": 0.1363825798034668 }, { "entropy": 9.248018264770508, "epoch": 0.19557049634170456, "mean_token_accuracy": 0.7307262420654297, "num_tokens": 10870173.0, "step": 1978, "train/ce_loss": 1.3531497716903687 }, { "epoch": 0.19557049634170456, "step": 1978, "train/sim_loss": 0.12890625 }, { "epoch": 0.19557049634170456, "step": 1978, "train/total_loss": 0.2642212510108948 }, { "entropy": 9.388627052307129, "epoch": 0.19566936919122008, "mean_token_accuracy": 0.8038976788520813, "num_tokens": 10875532.0, "step": 1979, "train/ce_loss": 0.5621894001960754 }, { "epoch": 0.19566936919122008, "step": 1979, "train/sim_loss": 0.02734375 }, { "epoch": 0.19566936919122008, "step": 1979, "train/total_loss": 0.0835626870393753 }, { "epoch": 0.19576824204073562, "grad_norm": 0.7459708452224731, "learning_rate": 9.513178064579934e-06, "loss": 0.149, "step": 1980 }, { "entropy": 9.421159744262695, "epoch": 0.19576824204073562, "mean_token_accuracy": 0.7195122241973877, "num_tokens": 10880931.0, "step": 1980, "train/ce_loss": 0.8020856380462646 }, { "epoch": 0.19576824204073562, "step": 1980, "train/sim_loss": 0.1015625 }, { "epoch": 0.19576824204073562, "step": 1980, "train/total_loss": 0.18177106976509094 }, { "entropy": 9.171142578125, "epoch": 0.19586711489025113, "mean_token_accuracy": 0.6482465267181396, "num_tokens": 10886463.0, "step": 1981, "train/ce_loss": 0.5439223051071167 }, { "epoch": 0.19586711489025113, "step": 1981, "train/sim_loss": 0.05859375 }, { "epoch": 0.19586711489025113, "step": 1981, "train/total_loss": 0.11298598349094391 }, { "entropy": 9.286457061767578, "epoch": 0.19596598773976667, "mean_token_accuracy": 0.6901072859764099, "num_tokens": 10891937.0, "step": 1982, "train/ce_loss": 1.667113184928894 }, { "epoch": 0.19596598773976667, "step": 1982, "train/sim_loss": 0.1015625 }, { "epoch": 0.19596598773976667, "step": 1982, "train/total_loss": 0.26827383041381836 }, { "entropy": 9.385889053344727, "epoch": 0.19606486058928219, "mean_token_accuracy": 0.7383720874786377, "num_tokens": 10897439.0, "step": 1983, "train/ce_loss": 0.5580955147743225 }, { "epoch": 0.19606486058928219, "step": 1983, "train/sim_loss": 0.02734375 }, { "epoch": 0.19606486058928219, "step": 1983, "train/total_loss": 0.08315330743789673 }, { "entropy": 9.158814430236816, "epoch": 0.1961637334387977, "mean_token_accuracy": 0.799578070640564, "num_tokens": 10902952.0, "step": 1984, "train/ce_loss": 0.6215584874153137 }, { "epoch": 0.1961637334387977, "step": 1984, "train/sim_loss": 0.02734375 }, { "epoch": 0.1961637334387977, "step": 1984, "train/total_loss": 0.08949960023164749 }, { "entropy": 9.103620529174805, "epoch": 0.19626260628831324, "mean_token_accuracy": 0.7744680643081665, "num_tokens": 10908599.0, "step": 1985, "train/ce_loss": 0.5834195017814636 }, { "epoch": 0.19626260628831324, "step": 1985, "train/sim_loss": 0.046875 }, { "epoch": 0.19626260628831324, "step": 1985, "train/total_loss": 0.10521695017814636 }, { "entropy": 9.391633033752441, "epoch": 0.19636147913782875, "mean_token_accuracy": 0.8110831379890442, "num_tokens": 10913983.0, "step": 1986, "train/ce_loss": 0.5818746089935303 }, { "epoch": 0.19636147913782875, "step": 1986, "train/sim_loss": 0.0546875 }, { "epoch": 0.19636147913782875, "step": 1986, "train/total_loss": 0.11287496238946915 }, { "entropy": 9.072154998779297, "epoch": 0.19646035198734427, "mean_token_accuracy": 0.7413793206214905, "num_tokens": 10919716.0, "step": 1987, "train/ce_loss": 0.9707176089286804 }, { "epoch": 0.19646035198734427, "step": 1987, "train/sim_loss": 0.12890625 }, { "epoch": 0.19646035198734427, "step": 1987, "train/total_loss": 0.22597801685333252 }, { "entropy": 9.0404634475708, "epoch": 0.1965592248368598, "mean_token_accuracy": 0.8209999799728394, "num_tokens": 10925375.0, "step": 1988, "train/ce_loss": 0.37624597549438477 }, { "epoch": 0.1965592248368598, "step": 1988, "train/sim_loss": 0.02734375 }, { "epoch": 0.1965592248368598, "step": 1988, "train/total_loss": 0.06496834754943848 }, { "entropy": 9.276455879211426, "epoch": 0.19665809768637532, "mean_token_accuracy": 0.7312430143356323, "num_tokens": 10930823.0, "step": 1989, "train/ce_loss": 0.38979750871658325 }, { "epoch": 0.19665809768637532, "step": 1989, "train/sim_loss": 0.109375 }, { "epoch": 0.19665809768637532, "step": 1989, "train/total_loss": 0.14835475385189056 }, { "entropy": 9.257169723510742, "epoch": 0.19675697053589083, "mean_token_accuracy": 0.7955865263938904, "num_tokens": 10936323.0, "step": 1990, "train/ce_loss": 0.878537654876709 }, { "epoch": 0.19675697053589083, "step": 1990, "train/sim_loss": 0.078125 }, { "epoch": 0.19675697053589083, "step": 1990, "train/total_loss": 0.16597875952720642 }, { "entropy": 9.021524429321289, "epoch": 0.19685584338540638, "mean_token_accuracy": 0.7347347140312195, "num_tokens": 10941941.0, "step": 1991, "train/ce_loss": 0.8852999806404114 }, { "epoch": 0.19685584338540638, "step": 1991, "train/sim_loss": 0.0703125 }, { "epoch": 0.19685584338540638, "step": 1991, "train/total_loss": 0.15884250402450562 }, { "entropy": 9.496986389160156, "epoch": 0.1969547162349219, "mean_token_accuracy": 0.7510489225387573, "num_tokens": 10947288.0, "step": 1992, "train/ce_loss": 0.4222087860107422 }, { "epoch": 0.1969547162349219, "step": 1992, "train/sim_loss": 0.03125 }, { "epoch": 0.1969547162349219, "step": 1992, "train/total_loss": 0.07347087562084198 }, { "entropy": 9.44383716583252, "epoch": 0.1970535890844374, "mean_token_accuracy": 0.7636594772338867, "num_tokens": 10952688.0, "step": 1993, "train/ce_loss": 0.7706331610679626 }, { "epoch": 0.1970535890844374, "step": 1993, "train/sim_loss": 0.12109375 }, { "epoch": 0.1970535890844374, "step": 1993, "train/total_loss": 0.19815707206726074 }, { "entropy": 9.015460968017578, "epoch": 0.19715246193395294, "mean_token_accuracy": 0.6983805894851685, "num_tokens": 10958316.0, "step": 1994, "train/ce_loss": 0.7875050902366638 }, { "epoch": 0.19715246193395294, "step": 1994, "train/sim_loss": 0.109375 }, { "epoch": 0.19715246193395294, "step": 1994, "train/total_loss": 0.18812552094459534 }, { "entropy": 9.162254333496094, "epoch": 0.19725133478346846, "mean_token_accuracy": 0.71517413854599, "num_tokens": 10963758.0, "step": 1995, "train/ce_loss": 1.1633720397949219 }, { "epoch": 0.19725133478346846, "step": 1995, "train/sim_loss": 0.1015625 }, { "epoch": 0.19725133478346846, "step": 1995, "train/total_loss": 0.21789970993995667 }, { "entropy": 9.462641716003418, "epoch": 0.19735020763298397, "mean_token_accuracy": 0.6871727705001831, "num_tokens": 10969110.0, "step": 1996, "train/ce_loss": 1.0915312767028809 }, { "epoch": 0.19735020763298397, "step": 1996, "train/sim_loss": 0.125 }, { "epoch": 0.19735020763298397, "step": 1996, "train/total_loss": 0.2341531217098236 }, { "entropy": 9.23176097869873, "epoch": 0.1974490804824995, "mean_token_accuracy": 0.7879133224487305, "num_tokens": 10974599.0, "step": 1997, "train/ce_loss": 0.45035725831985474 }, { "epoch": 0.1974490804824995, "step": 1997, "train/sim_loss": 0.06640625 }, { "epoch": 0.1974490804824995, "step": 1997, "train/total_loss": 0.11144197732210159 }, { "entropy": 8.977462768554688, "epoch": 0.19754795333201502, "mean_token_accuracy": 0.7513116598129272, "num_tokens": 10980166.0, "step": 1998, "train/ce_loss": 0.8542543053627014 }, { "epoch": 0.19754795333201502, "step": 1998, "train/sim_loss": 0.09375 }, { "epoch": 0.19754795333201502, "step": 1998, "train/total_loss": 0.17917543649673462 }, { "entropy": 9.44588851928711, "epoch": 0.19764682618153054, "mean_token_accuracy": 0.7206771373748779, "num_tokens": 10985537.0, "step": 1999, "train/ce_loss": 1.0001276731491089 }, { "epoch": 0.19764682618153054, "step": 1999, "train/sim_loss": 0.09765625 }, { "epoch": 0.19764682618153054, "step": 1999, "train/total_loss": 0.19766902923583984 }, { "epoch": 0.19774569903104608, "grad_norm": 0.871808648109436, "learning_rate": 9.508233199821986e-06, "loss": 0.1543, "step": 2000 }, { "entropy": 9.304159164428711, "epoch": 0.19774569903104608, "mean_token_accuracy": 0.783687949180603, "num_tokens": 10990982.0, "step": 2000, "train/ce_loss": 0.7165433764457703 }, { "epoch": 0.19774569903104608, "step": 2000, "train/sim_loss": 0.05859375 }, { "epoch": 0.19774569903104608, "step": 2000, "train/total_loss": 0.13024809956550598 }, { "entropy": 9.435250282287598, "epoch": 0.1978445718805616, "mean_token_accuracy": 0.7446504831314087, "num_tokens": 10996296.0, "step": 2001, "train/ce_loss": 0.8298952579498291 }, { "epoch": 0.1978445718805616, "step": 2001, "train/sim_loss": 0.06640625 }, { "epoch": 0.1978445718805616, "step": 2001, "train/total_loss": 0.14939577877521515 }, { "entropy": 9.367655754089355, "epoch": 0.19794344473007713, "mean_token_accuracy": 0.7557840347290039, "num_tokens": 11001708.0, "step": 2002, "train/ce_loss": 0.9128147959709167 }, { "epoch": 0.19794344473007713, "step": 2002, "train/sim_loss": 0.11328125 }, { "epoch": 0.19794344473007713, "step": 2002, "train/total_loss": 0.2045627236366272 }, { "entropy": 9.520770072937012, "epoch": 0.19804231757959265, "mean_token_accuracy": 0.7601042985916138, "num_tokens": 11007105.0, "step": 2003, "train/ce_loss": 1.1636344194412231 }, { "epoch": 0.19804231757959265, "step": 2003, "train/sim_loss": 0.10546875 }, { "epoch": 0.19804231757959265, "step": 2003, "train/total_loss": 0.22183218598365784 }, { "entropy": 9.077766418457031, "epoch": 0.19814119042910816, "mean_token_accuracy": 0.7861701846122742, "num_tokens": 11012724.0, "step": 2004, "train/ce_loss": 0.7944422960281372 }, { "epoch": 0.19814119042910816, "step": 2004, "train/sim_loss": 0.0546875 }, { "epoch": 0.19814119042910816, "step": 2004, "train/total_loss": 0.13413172960281372 }, { "entropy": 9.233171463012695, "epoch": 0.1982400632786237, "mean_token_accuracy": 0.7933094501495361, "num_tokens": 11018141.0, "step": 2005, "train/ce_loss": 0.6822364926338196 }, { "epoch": 0.1982400632786237, "step": 2005, "train/sim_loss": 0.08203125 }, { "epoch": 0.1982400632786237, "step": 2005, "train/total_loss": 0.15025490522384644 }, { "entropy": 9.307855606079102, "epoch": 0.19833893612813921, "mean_token_accuracy": 0.7979568839073181, "num_tokens": 11023805.0, "step": 2006, "train/ce_loss": 0.48593252897262573 }, { "epoch": 0.19833893612813921, "step": 2006, "train/sim_loss": 0.1015625 }, { "epoch": 0.19833893612813921, "step": 2006, "train/total_loss": 0.15015575289726257 }, { "entropy": 8.890583992004395, "epoch": 0.19843780897765473, "mean_token_accuracy": 0.7144067883491516, "num_tokens": 11029573.0, "step": 2007, "train/ce_loss": 0.7477504014968872 }, { "epoch": 0.19843780897765473, "step": 2007, "train/sim_loss": 0.09765625 }, { "epoch": 0.19843780897765473, "step": 2007, "train/total_loss": 0.17243129014968872 }, { "entropy": 9.064579010009766, "epoch": 0.19853668182717027, "mean_token_accuracy": 0.7509434223175049, "num_tokens": 11035191.0, "step": 2008, "train/ce_loss": 0.415785014629364 }, { "epoch": 0.19853668182717027, "step": 2008, "train/sim_loss": 0.01953125 }, { "epoch": 0.19853668182717027, "step": 2008, "train/total_loss": 0.0611097514629364 }, { "entropy": 9.590108871459961, "epoch": 0.19863555467668578, "mean_token_accuracy": 0.7562776803970337, "num_tokens": 11040490.0, "step": 2009, "train/ce_loss": 1.3896701335906982 }, { "epoch": 0.19863555467668578, "step": 2009, "train/sim_loss": 0.11328125 }, { "epoch": 0.19863555467668578, "step": 2009, "train/total_loss": 0.25224828720092773 }, { "entropy": 8.875904083251953, "epoch": 0.1987344275262013, "mean_token_accuracy": 0.7709497213363647, "num_tokens": 11046217.0, "step": 2010, "train/ce_loss": 0.5858079791069031 }, { "epoch": 0.1987344275262013, "step": 2010, "train/sim_loss": 0.03125 }, { "epoch": 0.1987344275262013, "step": 2010, "train/total_loss": 0.08983080089092255 }, { "entropy": 9.062847137451172, "epoch": 0.19883330037571684, "mean_token_accuracy": 0.7620000243186951, "num_tokens": 11051796.0, "step": 2011, "train/ce_loss": 0.43980568647384644 }, { "epoch": 0.19883330037571684, "step": 2011, "train/sim_loss": 0.02734375 }, { "epoch": 0.19883330037571684, "step": 2011, "train/total_loss": 0.07132431864738464 }, { "entropy": 9.067466735839844, "epoch": 0.19893217322523235, "mean_token_accuracy": 0.7127771973609924, "num_tokens": 11057329.0, "step": 2012, "train/ce_loss": 0.7592368721961975 }, { "epoch": 0.19893217322523235, "step": 2012, "train/sim_loss": 0.046875 }, { "epoch": 0.19893217322523235, "step": 2012, "train/total_loss": 0.12279868870973587 }, { "entropy": 9.32968521118164, "epoch": 0.19903104607474786, "mean_token_accuracy": 0.7732884287834167, "num_tokens": 11062733.0, "step": 2013, "train/ce_loss": 1.7065942287445068 }, { "epoch": 0.19903104607474786, "step": 2013, "train/sim_loss": 0.08984375 }, { "epoch": 0.19903104607474786, "step": 2013, "train/total_loss": 0.2605031728744507 }, { "entropy": 9.241215705871582, "epoch": 0.1991299189242634, "mean_token_accuracy": 0.7330960631370544, "num_tokens": 11068217.0, "step": 2014, "train/ce_loss": 1.2902578115463257 }, { "epoch": 0.1991299189242634, "step": 2014, "train/sim_loss": 0.07421875 }, { "epoch": 0.1991299189242634, "step": 2014, "train/total_loss": 0.20324453711509705 }, { "entropy": 9.170868873596191, "epoch": 0.19922879177377892, "mean_token_accuracy": 0.7217572927474976, "num_tokens": 11073747.0, "step": 2015, "train/ce_loss": 1.382637858390808 }, { "epoch": 0.19922879177377892, "step": 2015, "train/sim_loss": 0.06640625 }, { "epoch": 0.19922879177377892, "step": 2015, "train/total_loss": 0.2046700417995453 }, { "entropy": 9.2724609375, "epoch": 0.19932766462329443, "mean_token_accuracy": 0.7794432640075684, "num_tokens": 11079228.0, "step": 2016, "train/ce_loss": 0.515058159828186 }, { "epoch": 0.19932766462329443, "step": 2016, "train/sim_loss": 0.03125 }, { "epoch": 0.19932766462329443, "step": 2016, "train/total_loss": 0.08275581896305084 }, { "entropy": 8.966962814331055, "epoch": 0.19942653747280997, "mean_token_accuracy": 0.7830188870429993, "num_tokens": 11084889.0, "step": 2017, "train/ce_loss": 0.5813071727752686 }, { "epoch": 0.19942653747280997, "step": 2017, "train/sim_loss": 0.0625 }, { "epoch": 0.19942653747280997, "step": 2017, "train/total_loss": 0.12063071876764297 }, { "entropy": 8.407726287841797, "epoch": 0.19952541032232549, "mean_token_accuracy": 0.6813757419586182, "num_tokens": 11091011.0, "step": 2018, "train/ce_loss": 1.5650999546051025 }, { "epoch": 0.19952541032232549, "step": 2018, "train/sim_loss": 0.04296875 }, { "epoch": 0.19952541032232549, "step": 2018, "train/total_loss": 0.19947874546051025 }, { "entropy": 9.245950698852539, "epoch": 0.199624283171841, "mean_token_accuracy": 0.7423167824745178, "num_tokens": 11096387.0, "step": 2019, "train/ce_loss": 0.5987744331359863 }, { "epoch": 0.199624283171841, "step": 2019, "train/sim_loss": 0.0234375 }, { "epoch": 0.199624283171841, "step": 2019, "train/total_loss": 0.0833149403333664 }, { "epoch": 0.19972315602135654, "grad_norm": 0.7820895314216614, "learning_rate": 9.503288335064036e-06, "loss": 0.1472, "step": 2020 }, { "entropy": 8.962968826293945, "epoch": 0.19972315602135654, "mean_token_accuracy": 0.7775570750236511, "num_tokens": 11102044.0, "step": 2020, "train/ce_loss": 0.6894441843032837 }, { "epoch": 0.19972315602135654, "step": 2020, "train/sim_loss": 0.0625 }, { "epoch": 0.19972315602135654, "step": 2020, "train/total_loss": 0.13144442439079285 }, { "entropy": 9.337799072265625, "epoch": 0.19982202887087205, "mean_token_accuracy": 0.7723112106323242, "num_tokens": 11107536.0, "step": 2021, "train/ce_loss": 0.636396586894989 }, { "epoch": 0.19982202887087205, "step": 2021, "train/sim_loss": 0.05078125 }, { "epoch": 0.19982202887087205, "step": 2021, "train/total_loss": 0.11442091315984726 }, { "entropy": 8.874029159545898, "epoch": 0.1999209017203876, "mean_token_accuracy": 0.8050458431243896, "num_tokens": 11113025.0, "step": 2022, "train/ce_loss": 0.46313953399658203 }, { "epoch": 0.1999209017203876, "step": 2022, "train/sim_loss": 0.02734375 }, { "epoch": 0.1999209017203876, "step": 2022, "train/total_loss": 0.07365770637989044 }, { "entropy": 8.818004608154297, "epoch": 0.2000197745699031, "mean_token_accuracy": 0.7597571611404419, "num_tokens": 11118778.0, "step": 2023, "train/ce_loss": 0.5931119322776794 }, { "epoch": 0.2000197745699031, "step": 2023, "train/sim_loss": 0.06640625 }, { "epoch": 0.2000197745699031, "step": 2023, "train/total_loss": 0.12571744620800018 }, { "entropy": 9.553092956542969, "epoch": 0.20011864741941862, "mean_token_accuracy": 0.6993007063865662, "num_tokens": 11124088.0, "step": 2024, "train/ce_loss": 0.9076825976371765 }, { "epoch": 0.20011864741941862, "step": 2024, "train/sim_loss": 0.02734375 }, { "epoch": 0.20011864741941862, "step": 2024, "train/total_loss": 0.11811201274394989 }, { "entropy": 9.228398323059082, "epoch": 0.20021752026893416, "mean_token_accuracy": 0.7909930944442749, "num_tokens": 11129573.0, "step": 2025, "train/ce_loss": 0.6429710984230042 }, { "epoch": 0.20021752026893416, "step": 2025, "train/sim_loss": 0.0859375 }, { "epoch": 0.20021752026893416, "step": 2025, "train/total_loss": 0.15023460984230042 }, { "entropy": 9.172475814819336, "epoch": 0.20031639311844968, "mean_token_accuracy": 0.7584033608436584, "num_tokens": 11135063.0, "step": 2026, "train/ce_loss": 0.549929678440094 }, { "epoch": 0.20031639311844968, "step": 2026, "train/sim_loss": 0.01953125 }, { "epoch": 0.20031639311844968, "step": 2026, "train/total_loss": 0.07452422380447388 }, { "entropy": 8.86240005493164, "epoch": 0.2004152659679652, "mean_token_accuracy": 0.7551229596138, "num_tokens": 11140705.0, "step": 2027, "train/ce_loss": 0.5344739556312561 }, { "epoch": 0.2004152659679652, "step": 2027, "train/sim_loss": 0.1015625 }, { "epoch": 0.2004152659679652, "step": 2027, "train/total_loss": 0.1550098955631256 }, { "entropy": 9.288492202758789, "epoch": 0.20051413881748073, "mean_token_accuracy": 0.7414448857307434, "num_tokens": 11146168.0, "step": 2028, "train/ce_loss": 0.8897226452827454 }, { "epoch": 0.20051413881748073, "step": 2028, "train/sim_loss": 0.07421875 }, { "epoch": 0.20051413881748073, "step": 2028, "train/total_loss": 0.163191020488739 }, { "entropy": 9.101375579833984, "epoch": 0.20061301166699624, "mean_token_accuracy": 0.714628279209137, "num_tokens": 11151677.0, "step": 2029, "train/ce_loss": 0.8041356801986694 }, { "epoch": 0.20061301166699624, "step": 2029, "train/sim_loss": 0.1015625 }, { "epoch": 0.20061301166699624, "step": 2029, "train/total_loss": 0.1819760799407959 }, { "entropy": 9.378827095031738, "epoch": 0.20071188451651176, "mean_token_accuracy": 0.7607192397117615, "num_tokens": 11156950.0, "step": 2030, "train/ce_loss": 0.5256331562995911 }, { "epoch": 0.20071188451651176, "step": 2030, "train/sim_loss": 0.02734375 }, { "epoch": 0.20071188451651176, "step": 2030, "train/total_loss": 0.07990706712007523 }, { "entropy": 9.493762969970703, "epoch": 0.2008107573660273, "mean_token_accuracy": 0.7103658318519592, "num_tokens": 11162252.0, "step": 2031, "train/ce_loss": 1.22469961643219 }, { "epoch": 0.2008107573660273, "step": 2031, "train/sim_loss": 0.1484375 }, { "epoch": 0.2008107573660273, "step": 2031, "train/total_loss": 0.270907461643219 }, { "entropy": 9.454059600830078, "epoch": 0.2009096302155428, "mean_token_accuracy": 0.7443267703056335, "num_tokens": 11167523.0, "step": 2032, "train/ce_loss": 0.6316995024681091 }, { "epoch": 0.2009096302155428, "step": 2032, "train/sim_loss": 0.0703125 }, { "epoch": 0.2009096302155428, "step": 2032, "train/total_loss": 0.1334824562072754 }, { "entropy": 9.304862976074219, "epoch": 0.20100850306505832, "mean_token_accuracy": 0.7479674816131592, "num_tokens": 11172971.0, "step": 2033, "train/ce_loss": 1.3034157752990723 }, { "epoch": 0.20100850306505832, "step": 2033, "train/sim_loss": 0.08984375 }, { "epoch": 0.20100850306505832, "step": 2033, "train/total_loss": 0.220185324549675 }, { "entropy": 9.021543502807617, "epoch": 0.20110737591457387, "mean_token_accuracy": 0.725806474685669, "num_tokens": 11178635.0, "step": 2034, "train/ce_loss": 1.4683451652526855 }, { "epoch": 0.20110737591457387, "step": 2034, "train/sim_loss": 0.09765625 }, { "epoch": 0.20110737591457387, "step": 2034, "train/total_loss": 0.24449077248573303 }, { "entropy": 9.3250732421875, "epoch": 0.20120624876408938, "mean_token_accuracy": 0.7185184955596924, "num_tokens": 11184087.0, "step": 2035, "train/ce_loss": 0.6061079502105713 }, { "epoch": 0.20120624876408938, "step": 2035, "train/sim_loss": 0.03515625 }, { "epoch": 0.20120624876408938, "step": 2035, "train/total_loss": 0.0957670509815216 }, { "entropy": 9.218039512634277, "epoch": 0.2013051216136049, "mean_token_accuracy": 0.688720166683197, "num_tokens": 11189638.0, "step": 2036, "train/ce_loss": 0.6995318531990051 }, { "epoch": 0.2013051216136049, "step": 2036, "train/sim_loss": 0.05078125 }, { "epoch": 0.2013051216136049, "step": 2036, "train/total_loss": 0.12073443830013275 }, { "entropy": 9.170981407165527, "epoch": 0.20140399446312043, "mean_token_accuracy": 0.7375518679618835, "num_tokens": 11195187.0, "step": 2037, "train/ce_loss": 0.8873992562294006 }, { "epoch": 0.20140399446312043, "step": 2037, "train/sim_loss": 0.09765625 }, { "epoch": 0.20140399446312043, "step": 2037, "train/total_loss": 0.18639618158340454 }, { "entropy": 9.287162780761719, "epoch": 0.20150286731263595, "mean_token_accuracy": 0.7440699338912964, "num_tokens": 11200606.0, "step": 2038, "train/ce_loss": 1.0921783447265625 }, { "epoch": 0.20150286731263595, "step": 2038, "train/sim_loss": 0.0859375 }, { "epoch": 0.20150286731263595, "step": 2038, "train/total_loss": 0.1951553374528885 }, { "entropy": 8.93859577178955, "epoch": 0.20160174016215146, "mean_token_accuracy": 0.7406014800071716, "num_tokens": 11206346.0, "step": 2039, "train/ce_loss": 0.9606455564498901 }, { "epoch": 0.20160174016215146, "step": 2039, "train/sim_loss": 0.05078125 }, { "epoch": 0.20160174016215146, "step": 2039, "train/total_loss": 0.14684581756591797 }, { "epoch": 0.201700613011667, "grad_norm": 0.831764817237854, "learning_rate": 9.498343470306089e-06, "loss": 0.1508, "step": 2040 }, { "entropy": 9.136877059936523, "epoch": 0.201700613011667, "mean_token_accuracy": 0.7172414064407349, "num_tokens": 11211892.0, "step": 2040, "train/ce_loss": 0.7893755435943604 }, { "epoch": 0.201700613011667, "step": 2040, "train/sim_loss": 0.05859375 }, { "epoch": 0.201700613011667, "step": 2040, "train/total_loss": 0.1375313103199005 }, { "entropy": 9.249038696289062, "epoch": 0.20179948586118251, "mean_token_accuracy": 0.7275000214576721, "num_tokens": 11217291.0, "step": 2041, "train/ce_loss": 0.8898202776908875 }, { "epoch": 0.20179948586118251, "step": 2041, "train/sim_loss": 0.14453125 }, { "epoch": 0.20179948586118251, "step": 2041, "train/total_loss": 0.23351328074932098 }, { "entropy": 9.123167037963867, "epoch": 0.20189835871069806, "mean_token_accuracy": 0.7458506226539612, "num_tokens": 11222861.0, "step": 2042, "train/ce_loss": 0.9041544198989868 }, { "epoch": 0.20189835871069806, "step": 2042, "train/sim_loss": 0.1015625 }, { "epoch": 0.20189835871069806, "step": 2042, "train/total_loss": 0.19197794795036316 }, { "entropy": 8.985126495361328, "epoch": 0.20199723156021357, "mean_token_accuracy": 0.7242366671562195, "num_tokens": 11228571.0, "step": 2043, "train/ce_loss": 0.5731794238090515 }, { "epoch": 0.20199723156021357, "step": 2043, "train/sim_loss": 0.06640625 }, { "epoch": 0.20199723156021357, "step": 2043, "train/total_loss": 0.12372419238090515 }, { "entropy": 9.357343673706055, "epoch": 0.20209610440972908, "mean_token_accuracy": 0.7699593901634216, "num_tokens": 11233960.0, "step": 2044, "train/ce_loss": 0.7353712320327759 }, { "epoch": 0.20209610440972908, "step": 2044, "train/sim_loss": 0.0546875 }, { "epoch": 0.20209610440972908, "step": 2044, "train/total_loss": 0.12822462618350983 }, { "entropy": 9.063076972961426, "epoch": 0.20219497725924462, "mean_token_accuracy": 0.701886773109436, "num_tokens": 11239334.0, "step": 2045, "train/ce_loss": 0.7584655284881592 }, { "epoch": 0.20219497725924462, "step": 2045, "train/sim_loss": 0.04296875 }, { "epoch": 0.20219497725924462, "step": 2045, "train/total_loss": 0.11881530284881592 }, { "entropy": 9.33730697631836, "epoch": 0.20229385010876014, "mean_token_accuracy": 0.6911057829856873, "num_tokens": 11244809.0, "step": 2046, "train/ce_loss": 0.6836140751838684 }, { "epoch": 0.20229385010876014, "step": 2046, "train/sim_loss": 0.0625 }, { "epoch": 0.20229385010876014, "step": 2046, "train/total_loss": 0.13086140155792236 }, { "entropy": 9.341817855834961, "epoch": 0.20239272295827565, "mean_token_accuracy": 0.7617021203041077, "num_tokens": 11250140.0, "step": 2047, "train/ce_loss": 0.6595637202262878 }, { "epoch": 0.20239272295827565, "step": 2047, "train/sim_loss": 0.02734375 }, { "epoch": 0.20239272295827565, "step": 2047, "train/total_loss": 0.09330012649297714 }, { "entropy": 9.328086853027344, "epoch": 0.2024915958077912, "mean_token_accuracy": 0.7482678890228271, "num_tokens": 11255589.0, "step": 2048, "train/ce_loss": 0.45071882009506226 }, { "epoch": 0.2024915958077912, "step": 2048, "train/sim_loss": 0.05078125 }, { "epoch": 0.2024915958077912, "step": 2048, "train/total_loss": 0.09585313498973846 }, { "entropy": 9.430686950683594, "epoch": 0.2025904686573067, "mean_token_accuracy": 0.7496723532676697, "num_tokens": 11260928.0, "step": 2049, "train/ce_loss": 0.7953136563301086 }, { "epoch": 0.2025904686573067, "step": 2049, "train/sim_loss": 0.0234375 }, { "epoch": 0.2025904686573067, "step": 2049, "train/total_loss": 0.10296886414289474 }, { "entropy": 9.289460182189941, "epoch": 0.20268934150682222, "mean_token_accuracy": 0.7084210515022278, "num_tokens": 11266504.0, "step": 2050, "train/ce_loss": 0.6762533783912659 }, { "epoch": 0.20268934150682222, "step": 2050, "train/sim_loss": 0.0546875 }, { "epoch": 0.20268934150682222, "step": 2050, "train/total_loss": 0.12231283634901047 }, { "entropy": 9.069010734558105, "epoch": 0.20278821435633776, "mean_token_accuracy": 0.8026607632637024, "num_tokens": 11272052.0, "step": 2051, "train/ce_loss": 0.38799282908439636 }, { "epoch": 0.20278821435633776, "step": 2051, "train/sim_loss": 0.03125 }, { "epoch": 0.20278821435633776, "step": 2051, "train/total_loss": 0.07004928588867188 }, { "entropy": 9.489070892333984, "epoch": 0.20288708720585327, "mean_token_accuracy": 0.7434482574462891, "num_tokens": 11277443.0, "step": 2052, "train/ce_loss": 1.3448429107666016 }, { "epoch": 0.20288708720585327, "step": 2052, "train/sim_loss": 0.07421875 }, { "epoch": 0.20288708720585327, "step": 2052, "train/total_loss": 0.20870304107666016 }, { "entropy": 9.449377059936523, "epoch": 0.20298596005536879, "mean_token_accuracy": 0.7355889678001404, "num_tokens": 11282851.0, "step": 2053, "train/ce_loss": 1.3907837867736816 }, { "epoch": 0.20298596005536879, "step": 2053, "train/sim_loss": 0.07421875 }, { "epoch": 0.20298596005536879, "step": 2053, "train/total_loss": 0.21329712867736816 }, { "entropy": 9.375255584716797, "epoch": 0.20308483290488433, "mean_token_accuracy": 0.7545787692070007, "num_tokens": 11288224.0, "step": 2054, "train/ce_loss": 0.697818398475647 }, { "epoch": 0.20308483290488433, "step": 2054, "train/sim_loss": 0.05859375 }, { "epoch": 0.20308483290488433, "step": 2054, "train/total_loss": 0.1283755898475647 }, { "entropy": 9.092023849487305, "epoch": 0.20318370575439984, "mean_token_accuracy": 0.6929922103881836, "num_tokens": 11293707.0, "step": 2055, "train/ce_loss": 0.8970068693161011 }, { "epoch": 0.20318370575439984, "step": 2055, "train/sim_loss": 0.1015625 }, { "epoch": 0.20318370575439984, "step": 2055, "train/total_loss": 0.19126319885253906 }, { "entropy": 9.305285453796387, "epoch": 0.20328257860391535, "mean_token_accuracy": 0.7187141180038452, "num_tokens": 11299188.0, "step": 2056, "train/ce_loss": 1.2095251083374023 }, { "epoch": 0.20328257860391535, "step": 2056, "train/sim_loss": 0.09765625 }, { "epoch": 0.20328257860391535, "step": 2056, "train/total_loss": 0.2186087667942047 }, { "entropy": 9.057624816894531, "epoch": 0.2033814514534309, "mean_token_accuracy": 0.7471153736114502, "num_tokens": 11304767.0, "step": 2057, "train/ce_loss": 0.5634765625 }, { "epoch": 0.2033814514534309, "step": 2057, "train/sim_loss": 0.03125 }, { "epoch": 0.2033814514534309, "step": 2057, "train/total_loss": 0.08759765326976776 }, { "entropy": 9.465644836425781, "epoch": 0.2034803243029464, "mean_token_accuracy": 0.738896369934082, "num_tokens": 11310124.0, "step": 2058, "train/ce_loss": 0.6026142835617065 }, { "epoch": 0.2034803243029464, "step": 2058, "train/sim_loss": 0.05859375 }, { "epoch": 0.2034803243029464, "step": 2058, "train/total_loss": 0.11885517835617065 }, { "entropy": 8.793779373168945, "epoch": 0.20357919715246192, "mean_token_accuracy": 0.7328385710716248, "num_tokens": 11315859.0, "step": 2059, "train/ce_loss": 1.0839051008224487 }, { "epoch": 0.20357919715246192, "step": 2059, "train/sim_loss": 0.0625 }, { "epoch": 0.20357919715246192, "step": 2059, "train/total_loss": 0.17089051008224487 }, { "epoch": 0.20367807000197746, "grad_norm": 1.0248448848724365, "learning_rate": 9.493398605548139e-06, "loss": 0.1602, "step": 2060 }, { "entropy": 9.237092971801758, "epoch": 0.20367807000197746, "mean_token_accuracy": 0.7895335555076599, "num_tokens": 11321537.0, "step": 2060, "train/ce_loss": 0.4037958085536957 }, { "epoch": 0.20367807000197746, "step": 2060, "train/sim_loss": 0.01953125 }, { "epoch": 0.20367807000197746, "step": 2060, "train/total_loss": 0.05991083011031151 }, { "entropy": 8.755851745605469, "epoch": 0.20377694285149298, "mean_token_accuracy": 0.7382155060768127, "num_tokens": 11327316.0, "step": 2061, "train/ce_loss": 1.3966518640518188 }, { "epoch": 0.20377694285149298, "step": 2061, "train/sim_loss": 0.07421875 }, { "epoch": 0.20377694285149298, "step": 2061, "train/total_loss": 0.21388393640518188 }, { "entropy": 8.856025695800781, "epoch": 0.2038758157010085, "mean_token_accuracy": 0.7152542471885681, "num_tokens": 11333141.0, "step": 2062, "train/ce_loss": 0.81052565574646 }, { "epoch": 0.2038758157010085, "step": 2062, "train/sim_loss": 0.09375 }, { "epoch": 0.2038758157010085, "step": 2062, "train/total_loss": 0.17480257153511047 }, { "entropy": 9.007176399230957, "epoch": 0.20397468855052403, "mean_token_accuracy": 0.778969943523407, "num_tokens": 11338687.0, "step": 2063, "train/ce_loss": 0.40870678424835205 }, { "epoch": 0.20397468855052403, "step": 2063, "train/sim_loss": 0.03125 }, { "epoch": 0.20397468855052403, "step": 2063, "train/total_loss": 0.07212068140506744 }, { "entropy": 9.202945709228516, "epoch": 0.20407356140003954, "mean_token_accuracy": 0.7002617716789246, "num_tokens": 11344087.0, "step": 2064, "train/ce_loss": 1.099848747253418 }, { "epoch": 0.20407356140003954, "step": 2064, "train/sim_loss": 0.078125 }, { "epoch": 0.20407356140003954, "step": 2064, "train/total_loss": 0.1881098747253418 }, { "entropy": 9.32137680053711, "epoch": 0.20417243424955508, "mean_token_accuracy": 0.7543859481811523, "num_tokens": 11349534.0, "step": 2065, "train/ce_loss": 0.5917319655418396 }, { "epoch": 0.20417243424955508, "step": 2065, "train/sim_loss": 0.06640625 }, { "epoch": 0.20417243424955508, "step": 2065, "train/total_loss": 0.12557944655418396 }, { "entropy": 8.676595687866211, "epoch": 0.2042713070990706, "mean_token_accuracy": 0.7445544600486755, "num_tokens": 11355161.0, "step": 2066, "train/ce_loss": 1.1337857246398926 }, { "epoch": 0.2042713070990706, "step": 2066, "train/sim_loss": 0.046875 }, { "epoch": 0.2042713070990706, "step": 2066, "train/total_loss": 0.1602535843849182 }, { "entropy": 9.207037925720215, "epoch": 0.2043701799485861, "mean_token_accuracy": 0.7054176330566406, "num_tokens": 11360710.0, "step": 2067, "train/ce_loss": 1.0236471891403198 }, { "epoch": 0.2043701799485861, "step": 2067, "train/sim_loss": 0.1484375 }, { "epoch": 0.2043701799485861, "step": 2067, "train/total_loss": 0.250802218914032 }, { "entropy": 9.101083755493164, "epoch": 0.20446905279810165, "mean_token_accuracy": 0.7180067896842957, "num_tokens": 11366184.0, "step": 2068, "train/ce_loss": 0.6752704977989197 }, { "epoch": 0.20446905279810165, "step": 2068, "train/sim_loss": 0.10546875 }, { "epoch": 0.20446905279810165, "step": 2068, "train/total_loss": 0.17299580574035645 }, { "entropy": 9.486696243286133, "epoch": 0.20456792564761717, "mean_token_accuracy": 0.7557544708251953, "num_tokens": 11371534.0, "step": 2069, "train/ce_loss": 0.8139979243278503 }, { "epoch": 0.20456792564761717, "step": 2069, "train/sim_loss": 0.08984375 }, { "epoch": 0.20456792564761717, "step": 2069, "train/total_loss": 0.1712435483932495 }, { "entropy": 9.263586044311523, "epoch": 0.20466679849713268, "mean_token_accuracy": 0.7773631811141968, "num_tokens": 11376998.0, "step": 2070, "train/ce_loss": 0.8640584349632263 }, { "epoch": 0.20466679849713268, "step": 2070, "train/sim_loss": 0.0859375 }, { "epoch": 0.20466679849713268, "step": 2070, "train/total_loss": 0.17234334349632263 }, { "entropy": 9.017020225524902, "epoch": 0.20476567134664822, "mean_token_accuracy": 0.7160621881484985, "num_tokens": 11382598.0, "step": 2071, "train/ce_loss": 0.8654956817626953 }, { "epoch": 0.20476567134664822, "step": 2071, "train/sim_loss": 0.12109375 }, { "epoch": 0.20476567134664822, "step": 2071, "train/total_loss": 0.2076433300971985 }, { "entropy": 9.468940734863281, "epoch": 0.20486454419616373, "mean_token_accuracy": 0.751724123954773, "num_tokens": 11387940.0, "step": 2072, "train/ce_loss": 0.6054278612136841 }, { "epoch": 0.20486454419616373, "step": 2072, "train/sim_loss": 0.0625 }, { "epoch": 0.20486454419616373, "step": 2072, "train/total_loss": 0.12304279208183289 }, { "entropy": 9.414447784423828, "epoch": 0.20496341704567925, "mean_token_accuracy": 0.7229729890823364, "num_tokens": 11393367.0, "step": 2073, "train/ce_loss": 0.6535100936889648 }, { "epoch": 0.20496341704567925, "step": 2073, "train/sim_loss": 0.08203125 }, { "epoch": 0.20496341704567925, "step": 2073, "train/total_loss": 0.14738225936889648 }, { "entropy": 9.57999038696289, "epoch": 0.2050622898951948, "mean_token_accuracy": 0.7168262600898743, "num_tokens": 11398653.0, "step": 2074, "train/ce_loss": 0.7539426684379578 }, { "epoch": 0.2050622898951948, "step": 2074, "train/sim_loss": 0.03515625 }, { "epoch": 0.2050622898951948, "step": 2074, "train/total_loss": 0.11055051535367966 }, { "entropy": 8.943816184997559, "epoch": 0.2051611627447103, "mean_token_accuracy": 0.7687007784843445, "num_tokens": 11404263.0, "step": 2075, "train/ce_loss": 0.7112745642662048 }, { "epoch": 0.2051611627447103, "step": 2075, "train/sim_loss": 0.05078125 }, { "epoch": 0.2051611627447103, "step": 2075, "train/total_loss": 0.12190870940685272 }, { "entropy": 9.255550384521484, "epoch": 0.20526003559422581, "mean_token_accuracy": 0.745476484298706, "num_tokens": 11409682.0, "step": 2076, "train/ce_loss": 0.6346203088760376 }, { "epoch": 0.20526003559422581, "step": 2076, "train/sim_loss": 0.0703125 }, { "epoch": 0.20526003559422581, "step": 2076, "train/total_loss": 0.133774533867836 }, { "entropy": 9.199384689331055, "epoch": 0.20535890844374136, "mean_token_accuracy": 0.7406616806983948, "num_tokens": 11415188.0, "step": 2077, "train/ce_loss": 0.841301441192627 }, { "epoch": 0.20535890844374136, "step": 2077, "train/sim_loss": 0.0234375 }, { "epoch": 0.20535890844374136, "step": 2077, "train/total_loss": 0.10756764560937881 }, { "entropy": 9.107648849487305, "epoch": 0.20545778129325687, "mean_token_accuracy": 0.7174940705299377, "num_tokens": 11420734.0, "step": 2078, "train/ce_loss": 0.8768755793571472 }, { "epoch": 0.20545778129325687, "step": 2078, "train/sim_loss": 0.0703125 }, { "epoch": 0.20545778129325687, "step": 2078, "train/total_loss": 0.15800005197525024 }, { "entropy": 8.94620132446289, "epoch": 0.20555665414277238, "mean_token_accuracy": 0.6978879570960999, "num_tokens": 11426432.0, "step": 2079, "train/ce_loss": 0.5795814394950867 }, { "epoch": 0.20555665414277238, "step": 2079, "train/sim_loss": 0.1171875 }, { "epoch": 0.20555665414277238, "step": 2079, "train/total_loss": 0.17514564096927643 }, { "epoch": 0.20565552699228792, "grad_norm": 1.136067271232605, "learning_rate": 9.48845374079019e-06, "loss": 0.1592, "step": 2080 }, { "entropy": 9.461761474609375, "epoch": 0.20565552699228792, "mean_token_accuracy": 0.7022696733474731, "num_tokens": 11431762.0, "step": 2080, "train/ce_loss": 1.1437028646469116 }, { "epoch": 0.20565552699228792, "step": 2080, "train/sim_loss": 0.0859375 }, { "epoch": 0.20565552699228792, "step": 2080, "train/total_loss": 0.20030778646469116 }, { "entropy": 9.001717567443848, "epoch": 0.20575439984180344, "mean_token_accuracy": 0.7960339784622192, "num_tokens": 11437457.0, "step": 2081, "train/ce_loss": 0.8532793521881104 }, { "epoch": 0.20575439984180344, "step": 2081, "train/sim_loss": 0.09375 }, { "epoch": 0.20575439984180344, "step": 2081, "train/total_loss": 0.17907793819904327 }, { "entropy": 9.302957534790039, "epoch": 0.20585327269131895, "mean_token_accuracy": 0.7199519276618958, "num_tokens": 11442871.0, "step": 2082, "train/ce_loss": 1.8868350982666016 }, { "epoch": 0.20585327269131895, "step": 2082, "train/sim_loss": 0.06640625 }, { "epoch": 0.20585327269131895, "step": 2082, "train/total_loss": 0.25508975982666016 }, { "entropy": 9.592981338500977, "epoch": 0.2059521455408345, "mean_token_accuracy": 0.739534854888916, "num_tokens": 11448073.0, "step": 2083, "train/ce_loss": 0.850782036781311 }, { "epoch": 0.2059521455408345, "step": 2083, "train/sim_loss": 0.08984375 }, { "epoch": 0.2059521455408345, "step": 2083, "train/total_loss": 0.17492195963859558 }, { "entropy": 9.291143417358398, "epoch": 0.20605101839035, "mean_token_accuracy": 0.750295877456665, "num_tokens": 11453547.0, "step": 2084, "train/ce_loss": 0.7265034914016724 }, { "epoch": 0.20605101839035, "step": 2084, "train/sim_loss": 0.0625 }, { "epoch": 0.20605101839035, "step": 2084, "train/total_loss": 0.13515034317970276 }, { "entropy": 8.939087867736816, "epoch": 0.20614989123986555, "mean_token_accuracy": 0.7485822439193726, "num_tokens": 11459233.0, "step": 2085, "train/ce_loss": 1.3212883472442627 }, { "epoch": 0.20614989123986555, "step": 2085, "train/sim_loss": 0.08984375 }, { "epoch": 0.20614989123986555, "step": 2085, "train/total_loss": 0.22197258472442627 }, { "entropy": 9.23195743560791, "epoch": 0.20624876408938106, "mean_token_accuracy": 0.7831050157546997, "num_tokens": 11464755.0, "step": 2086, "train/ce_loss": 0.8277024626731873 }, { "epoch": 0.20624876408938106, "step": 2086, "train/sim_loss": 0.0625 }, { "epoch": 0.20624876408938106, "step": 2086, "train/total_loss": 0.14527025818824768 }, { "entropy": 8.780197143554688, "epoch": 0.20634763693889657, "mean_token_accuracy": 0.7297777533531189, "num_tokens": 11470520.0, "step": 2087, "train/ce_loss": 0.7377793788909912 }, { "epoch": 0.20634763693889657, "step": 2087, "train/sim_loss": 0.078125 }, { "epoch": 0.20634763693889657, "step": 2087, "train/total_loss": 0.1519029438495636 }, { "entropy": 9.282990455627441, "epoch": 0.2064465097884121, "mean_token_accuracy": 0.7046459913253784, "num_tokens": 11476031.0, "step": 2088, "train/ce_loss": 0.946122407913208 }, { "epoch": 0.2064465097884121, "step": 2088, "train/sim_loss": 0.04296875 }, { "epoch": 0.2064465097884121, "step": 2088, "train/total_loss": 0.1375809907913208 }, { "entropy": 9.203896522521973, "epoch": 0.20654538263792763, "mean_token_accuracy": 0.6758794188499451, "num_tokens": 11481485.0, "step": 2089, "train/ce_loss": 0.7663823962211609 }, { "epoch": 0.20654538263792763, "step": 2089, "train/sim_loss": 0.046875 }, { "epoch": 0.20654538263792763, "step": 2089, "train/total_loss": 0.12351324409246445 }, { "entropy": 9.545255661010742, "epoch": 0.20664425548744314, "mean_token_accuracy": 0.7226666808128357, "num_tokens": 11486745.0, "step": 2090, "train/ce_loss": 0.9358912110328674 }, { "epoch": 0.20664425548744314, "step": 2090, "train/sim_loss": 0.09375 }, { "epoch": 0.20664425548744314, "step": 2090, "train/total_loss": 0.18733912706375122 }, { "entropy": 9.060846328735352, "epoch": 0.20674312833695868, "mean_token_accuracy": 0.8136125802993774, "num_tokens": 11492370.0, "step": 2091, "train/ce_loss": 0.6347329616546631 }, { "epoch": 0.20674312833695868, "step": 2091, "train/sim_loss": 0.0625 }, { "epoch": 0.20674312833695868, "step": 2091, "train/total_loss": 0.12597329914569855 }, { "entropy": 8.783164024353027, "epoch": 0.2068420011864742, "mean_token_accuracy": 0.7348877191543579, "num_tokens": 11498083.0, "step": 2092, "train/ce_loss": 0.7737459540367126 }, { "epoch": 0.2068420011864742, "step": 2092, "train/sim_loss": 0.0625 }, { "epoch": 0.2068420011864742, "step": 2092, "train/total_loss": 0.13987460732460022 }, { "entropy": 9.30455493927002, "epoch": 0.2069408740359897, "mean_token_accuracy": 0.7378410696983337, "num_tokens": 11503527.0, "step": 2093, "train/ce_loss": 0.7199686169624329 }, { "epoch": 0.2069408740359897, "step": 2093, "train/sim_loss": 0.078125 }, { "epoch": 0.2069408740359897, "step": 2093, "train/total_loss": 0.15012186765670776 }, { "entropy": 9.162153244018555, "epoch": 0.20703974688550525, "mean_token_accuracy": 0.7116336822509766, "num_tokens": 11508839.0, "step": 2094, "train/ce_loss": 0.4890272617340088 }, { "epoch": 0.20703974688550525, "step": 2094, "train/sim_loss": 0.0625 }, { "epoch": 0.20703974688550525, "step": 2094, "train/total_loss": 0.111402727663517 }, { "entropy": 9.310592651367188, "epoch": 0.20713861973502076, "mean_token_accuracy": 0.6944444179534912, "num_tokens": 11514229.0, "step": 2095, "train/ce_loss": 1.105270504951477 }, { "epoch": 0.20713861973502076, "step": 2095, "train/sim_loss": 0.06640625 }, { "epoch": 0.20713861973502076, "step": 2095, "train/total_loss": 0.17693330347537994 }, { "entropy": 9.047321319580078, "epoch": 0.20723749258453628, "mean_token_accuracy": 0.7349896430969238, "num_tokens": 11519678.0, "step": 2096, "train/ce_loss": 0.6104051470756531 }, { "epoch": 0.20723749258453628, "step": 2096, "train/sim_loss": 0.0546875 }, { "epoch": 0.20723749258453628, "step": 2096, "train/total_loss": 0.11572802066802979 }, { "entropy": 9.538384437561035, "epoch": 0.20733636543405182, "mean_token_accuracy": 0.7323529124259949, "num_tokens": 11525002.0, "step": 2097, "train/ce_loss": 0.7976276874542236 }, { "epoch": 0.20733636543405182, "step": 2097, "train/sim_loss": 0.09375 }, { "epoch": 0.20733636543405182, "step": 2097, "train/total_loss": 0.1735127717256546 }, { "entropy": 9.272768020629883, "epoch": 0.20743523828356733, "mean_token_accuracy": 0.7942643165588379, "num_tokens": 11530432.0, "step": 2098, "train/ce_loss": 0.7143188118934631 }, { "epoch": 0.20743523828356733, "step": 2098, "train/sim_loss": 0.09765625 }, { "epoch": 0.20743523828356733, "step": 2098, "train/total_loss": 0.16908812522888184 }, { "entropy": 9.371561050415039, "epoch": 0.20753411113308284, "mean_token_accuracy": 0.751968502998352, "num_tokens": 11535792.0, "step": 2099, "train/ce_loss": 0.6509340405464172 }, { "epoch": 0.20753411113308284, "step": 2099, "train/sim_loss": 0.07421875 }, { "epoch": 0.20753411113308284, "step": 2099, "train/total_loss": 0.13931214809417725 }, { "epoch": 0.20763298398259838, "grad_norm": 0.8685257434844971, "learning_rate": 9.483508876032242e-06, "loss": 0.1492, "step": 2100 }, { "entropy": 8.841520309448242, "epoch": 0.20763298398259838, "mean_token_accuracy": 0.7928118109703064, "num_tokens": 11541677.0, "step": 2100, "train/ce_loss": 0.5759143233299255 }, { "epoch": 0.20763298398259838, "step": 2100, "train/sim_loss": 0.046875 }, { "epoch": 0.20763298398259838, "step": 2100, "train/total_loss": 0.10446643829345703 }, { "entropy": 9.006629943847656, "epoch": 0.2077318568321139, "mean_token_accuracy": 0.7171464562416077, "num_tokens": 11547144.0, "step": 2101, "train/ce_loss": 1.0054140090942383 }, { "epoch": 0.2077318568321139, "step": 2101, "train/sim_loss": 0.1328125 }, { "epoch": 0.2077318568321139, "step": 2101, "train/total_loss": 0.23335391283035278 }, { "entropy": 9.19842529296875, "epoch": 0.2078307296816294, "mean_token_accuracy": 0.7816216349601746, "num_tokens": 11552722.0, "step": 2102, "train/ce_loss": 0.8183422088623047 }, { "epoch": 0.2078307296816294, "step": 2102, "train/sim_loss": 0.078125 }, { "epoch": 0.2078307296816294, "step": 2102, "train/total_loss": 0.15995922684669495 }, { "entropy": 9.32013988494873, "epoch": 0.20792960253114495, "mean_token_accuracy": 0.720818281173706, "num_tokens": 11558092.0, "step": 2103, "train/ce_loss": 1.6500827074050903 }, { "epoch": 0.20792960253114495, "step": 2103, "train/sim_loss": 0.08203125 }, { "epoch": 0.20792960253114495, "step": 2103, "train/total_loss": 0.2470395267009735 }, { "entropy": 9.592901229858398, "epoch": 0.20802847538066047, "mean_token_accuracy": 0.7601625919342041, "num_tokens": 11563367.0, "step": 2104, "train/ce_loss": 0.6019082069396973 }, { "epoch": 0.20802847538066047, "step": 2104, "train/sim_loss": 0.05859375 }, { "epoch": 0.20802847538066047, "step": 2104, "train/total_loss": 0.1187845766544342 }, { "entropy": 9.526144027709961, "epoch": 0.208127348230176, "mean_token_accuracy": 0.7392510175704956, "num_tokens": 11568886.0, "step": 2105, "train/ce_loss": 1.0078984498977661 }, { "epoch": 0.208127348230176, "step": 2105, "train/sim_loss": 0.09375 }, { "epoch": 0.208127348230176, "step": 2105, "train/total_loss": 0.1945398449897766 }, { "entropy": 9.020626068115234, "epoch": 0.20822622107969152, "mean_token_accuracy": 0.675273060798645, "num_tokens": 11574549.0, "step": 2106, "train/ce_loss": 1.9288792610168457 }, { "epoch": 0.20822622107969152, "step": 2106, "train/sim_loss": 0.0859375 }, { "epoch": 0.20822622107969152, "step": 2106, "train/total_loss": 0.27882543206214905 }, { "entropy": 9.515058517456055, "epoch": 0.20832509392920703, "mean_token_accuracy": 0.7262997031211853, "num_tokens": 11579854.0, "step": 2107, "train/ce_loss": 0.5892929434776306 }, { "epoch": 0.20832509392920703, "step": 2107, "train/sim_loss": 0.05859375 }, { "epoch": 0.20832509392920703, "step": 2107, "train/total_loss": 0.11752304434776306 }, { "entropy": 9.345060348510742, "epoch": 0.20842396677872257, "mean_token_accuracy": 0.7153196334838867, "num_tokens": 11585314.0, "step": 2108, "train/ce_loss": 1.1945141553878784 }, { "epoch": 0.20842396677872257, "step": 2108, "train/sim_loss": 0.046875 }, { "epoch": 0.20842396677872257, "step": 2108, "train/total_loss": 0.16632641851902008 }, { "entropy": 9.259843826293945, "epoch": 0.2085228396282381, "mean_token_accuracy": 0.7359116077423096, "num_tokens": 11590812.0, "step": 2109, "train/ce_loss": 1.0837774276733398 }, { "epoch": 0.2085228396282381, "step": 2109, "train/sim_loss": 0.09375 }, { "epoch": 0.2085228396282381, "step": 2109, "train/total_loss": 0.20212775468826294 }, { "entropy": 8.866198539733887, "epoch": 0.2086217124777536, "mean_token_accuracy": 0.7292020320892334, "num_tokens": 11596551.0, "step": 2110, "train/ce_loss": 0.37230604887008667 }, { "epoch": 0.2086217124777536, "step": 2110, "train/sim_loss": 0.05859375 }, { "epoch": 0.2086217124777536, "step": 2110, "train/total_loss": 0.09582436084747314 }, { "entropy": 9.546430587768555, "epoch": 0.20872058532726914, "mean_token_accuracy": 0.7461645603179932, "num_tokens": 11601847.0, "step": 2111, "train/ce_loss": 0.6731482744216919 }, { "epoch": 0.20872058532726914, "step": 2111, "train/sim_loss": 0.08203125 }, { "epoch": 0.20872058532726914, "step": 2111, "train/total_loss": 0.14934608340263367 }, { "entropy": 8.947864532470703, "epoch": 0.20881945817678466, "mean_token_accuracy": 0.8025169372558594, "num_tokens": 11607566.0, "step": 2112, "train/ce_loss": 0.7110840082168579 }, { "epoch": 0.20881945817678466, "step": 2112, "train/sim_loss": 0.0546875 }, { "epoch": 0.20881945817678466, "step": 2112, "train/total_loss": 0.1257959008216858 }, { "entropy": 9.357686042785645, "epoch": 0.20891833102630017, "mean_token_accuracy": 0.7212543487548828, "num_tokens": 11613102.0, "step": 2113, "train/ce_loss": 0.952403724193573 }, { "epoch": 0.20891833102630017, "step": 2113, "train/sim_loss": 0.09765625 }, { "epoch": 0.20891833102630017, "step": 2113, "train/total_loss": 0.19289663434028625 }, { "entropy": 8.916419982910156, "epoch": 0.2090172038758157, "mean_token_accuracy": 0.7374657988548279, "num_tokens": 11618749.0, "step": 2114, "train/ce_loss": 0.38604292273521423 }, { "epoch": 0.2090172038758157, "step": 2114, "train/sim_loss": 0.0546875 }, { "epoch": 0.2090172038758157, "step": 2114, "train/total_loss": 0.09329178929328918 }, { "entropy": 9.266845703125, "epoch": 0.20911607672533122, "mean_token_accuracy": 0.781440794467926, "num_tokens": 11624221.0, "step": 2115, "train/ce_loss": 0.6439337134361267 }, { "epoch": 0.20911607672533122, "step": 2115, "train/sim_loss": 0.0390625 }, { "epoch": 0.20911607672533122, "step": 2115, "train/total_loss": 0.10345587134361267 }, { "entropy": 9.439970016479492, "epoch": 0.20921494957484674, "mean_token_accuracy": 0.7262499928474426, "num_tokens": 11629546.0, "step": 2116, "train/ce_loss": 0.8910729885101318 }, { "epoch": 0.20921494957484674, "step": 2116, "train/sim_loss": 0.05859375 }, { "epoch": 0.20921494957484674, "step": 2116, "train/total_loss": 0.14770105481147766 }, { "entropy": 9.118417739868164, "epoch": 0.20931382242436228, "mean_token_accuracy": 0.7053333520889282, "num_tokens": 11634906.0, "step": 2117, "train/ce_loss": 0.8365302681922913 }, { "epoch": 0.20931382242436228, "step": 2117, "train/sim_loss": 0.046875 }, { "epoch": 0.20931382242436228, "step": 2117, "train/total_loss": 0.1305280327796936 }, { "entropy": 9.07475471496582, "epoch": 0.2094126952738778, "mean_token_accuracy": 0.7468671798706055, "num_tokens": 11640384.0, "step": 2118, "train/ce_loss": 0.9829981327056885 }, { "epoch": 0.2094126952738778, "step": 2118, "train/sim_loss": 0.125 }, { "epoch": 0.2094126952738778, "step": 2118, "train/total_loss": 0.2232998162508011 }, { "entropy": 9.518635749816895, "epoch": 0.2095115681233933, "mean_token_accuracy": 0.7550724744796753, "num_tokens": 11645701.0, "step": 2119, "train/ce_loss": 0.9550386071205139 }, { "epoch": 0.2095115681233933, "step": 2119, "train/sim_loss": 0.0703125 }, { "epoch": 0.2095115681233933, "step": 2119, "train/total_loss": 0.16581636667251587 }, { "epoch": 0.20961044097290885, "grad_norm": 0.805097222328186, "learning_rate": 9.478564011274292e-06, "loss": 0.1529, "step": 2120 }, { "entropy": 9.314053535461426, "epoch": 0.20961044097290885, "mean_token_accuracy": 0.739294707775116, "num_tokens": 11651127.0, "step": 2120, "train/ce_loss": 1.313176155090332 }, { "epoch": 0.20961044097290885, "step": 2120, "train/sim_loss": 0.06640625 }, { "epoch": 0.20961044097290885, "step": 2120, "train/total_loss": 0.1977238655090332 }, { "entropy": 9.54316234588623, "epoch": 0.20970931382242436, "mean_token_accuracy": 0.708776593208313, "num_tokens": 11656491.0, "step": 2121, "train/ce_loss": 0.7020536661148071 }, { "epoch": 0.20970931382242436, "step": 2121, "train/sim_loss": 0.0703125 }, { "epoch": 0.20970931382242436, "step": 2121, "train/total_loss": 0.14051786065101624 }, { "entropy": 9.363290786743164, "epoch": 0.20980818667193987, "mean_token_accuracy": 0.7278481125831604, "num_tokens": 11661940.0, "step": 2122, "train/ce_loss": 1.7179120779037476 }, { "epoch": 0.20980818667193987, "step": 2122, "train/sim_loss": 0.10546875 }, { "epoch": 0.20980818667193987, "step": 2122, "train/total_loss": 0.2772599458694458 }, { "entropy": 9.33470344543457, "epoch": 0.2099070595214554, "mean_token_accuracy": 0.7641395926475525, "num_tokens": 11667336.0, "step": 2123, "train/ce_loss": 0.9940215945243835 }, { "epoch": 0.2099070595214554, "step": 2123, "train/sim_loss": 0.01953125 }, { "epoch": 0.2099070595214554, "step": 2123, "train/total_loss": 0.11893340945243835 }, { "entropy": 9.123260498046875, "epoch": 0.21000593237097093, "mean_token_accuracy": 0.7672209143638611, "num_tokens": 11672784.0, "step": 2124, "train/ce_loss": 0.7837958335876465 }, { "epoch": 0.21000593237097093, "step": 2124, "train/sim_loss": 0.06640625 }, { "epoch": 0.21000593237097093, "step": 2124, "train/total_loss": 0.1447858363389969 }, { "entropy": 9.18411922454834, "epoch": 0.21010480522048647, "mean_token_accuracy": 0.7457817792892456, "num_tokens": 11678283.0, "step": 2125, "train/ce_loss": 0.7944912910461426 }, { "epoch": 0.21010480522048647, "step": 2125, "train/sim_loss": 0.1328125 }, { "epoch": 0.21010480522048647, "step": 2125, "train/total_loss": 0.2122616320848465 }, { "entropy": 9.4698486328125, "epoch": 0.21020367807000198, "mean_token_accuracy": 0.6689944267272949, "num_tokens": 11683645.0, "step": 2126, "train/ce_loss": 1.0283634662628174 }, { "epoch": 0.21020367807000198, "step": 2126, "train/sim_loss": 0.07421875 }, { "epoch": 0.21020367807000198, "step": 2126, "train/total_loss": 0.17705509066581726 }, { "entropy": 9.163232803344727, "epoch": 0.2103025509195175, "mean_token_accuracy": 0.7237512469291687, "num_tokens": 11689184.0, "step": 2127, "train/ce_loss": 1.1656492948532104 }, { "epoch": 0.2103025509195175, "step": 2127, "train/sim_loss": 0.109375 }, { "epoch": 0.2103025509195175, "step": 2127, "train/total_loss": 0.22593992948532104 }, { "entropy": 9.363082885742188, "epoch": 0.21040142376903304, "mean_token_accuracy": 0.7540574073791504, "num_tokens": 11694641.0, "step": 2128, "train/ce_loss": 1.066627025604248 }, { "epoch": 0.21040142376903304, "step": 2128, "train/sim_loss": 0.0859375 }, { "epoch": 0.21040142376903304, "step": 2128, "train/total_loss": 0.19260020554065704 }, { "entropy": 9.284854888916016, "epoch": 0.21050029661854855, "mean_token_accuracy": 0.706875741481781, "num_tokens": 11700078.0, "step": 2129, "train/ce_loss": 0.9174749255180359 }, { "epoch": 0.21050029661854855, "step": 2129, "train/sim_loss": 0.08984375 }, { "epoch": 0.21050029661854855, "step": 2129, "train/total_loss": 0.1815912425518036 }, { "entropy": 9.530131340026855, "epoch": 0.21059916946806406, "mean_token_accuracy": 0.7319148778915405, "num_tokens": 11705423.0, "step": 2130, "train/ce_loss": 1.0002235174179077 }, { "epoch": 0.21059916946806406, "step": 2130, "train/sim_loss": 0.10546875 }, { "epoch": 0.21059916946806406, "step": 2130, "train/total_loss": 0.2054910957813263 }, { "entropy": 9.669187545776367, "epoch": 0.2106980423175796, "mean_token_accuracy": 0.7316341996192932, "num_tokens": 11710686.0, "step": 2131, "train/ce_loss": 0.6858426332473755 }, { "epoch": 0.2106980423175796, "step": 2131, "train/sim_loss": 0.046875 }, { "epoch": 0.2106980423175796, "step": 2131, "train/total_loss": 0.11545926332473755 }, { "entropy": 9.437677383422852, "epoch": 0.21079691516709512, "mean_token_accuracy": 0.6938775777816772, "num_tokens": 11715960.0, "step": 2132, "train/ce_loss": 0.9249535202980042 }, { "epoch": 0.21079691516709512, "step": 2132, "train/sim_loss": 0.06640625 }, { "epoch": 0.21079691516709512, "step": 2132, "train/total_loss": 0.15890160202980042 }, { "entropy": 8.902722358703613, "epoch": 0.21089578801661063, "mean_token_accuracy": 0.6993927359580994, "num_tokens": 11721612.0, "step": 2133, "train/ce_loss": 0.9621187448501587 }, { "epoch": 0.21089578801661063, "step": 2133, "train/sim_loss": 0.0859375 }, { "epoch": 0.21089578801661063, "step": 2133, "train/total_loss": 0.18214938044548035 }, { "entropy": 9.05774211883545, "epoch": 0.21099466086612617, "mean_token_accuracy": 0.7642369270324707, "num_tokens": 11727141.0, "step": 2134, "train/ce_loss": 0.7873607277870178 }, { "epoch": 0.21099466086612617, "step": 2134, "train/sim_loss": 0.0390625 }, { "epoch": 0.21099466086612617, "step": 2134, "train/total_loss": 0.1177985742688179 }, { "entropy": 9.476606369018555, "epoch": 0.21109353371564168, "mean_token_accuracy": 0.7689075469970703, "num_tokens": 11732534.0, "step": 2135, "train/ce_loss": 0.7095643877983093 }, { "epoch": 0.21109353371564168, "step": 2135, "train/sim_loss": 0.0625 }, { "epoch": 0.21109353371564168, "step": 2135, "train/total_loss": 0.13345643877983093 }, { "entropy": 9.3292818069458, "epoch": 0.2111924065651572, "mean_token_accuracy": 0.7787356376647949, "num_tokens": 11737880.0, "step": 2136, "train/ce_loss": 0.6097721457481384 }, { "epoch": 0.2111924065651572, "step": 2136, "train/sim_loss": 0.07421875 }, { "epoch": 0.2111924065651572, "step": 2136, "train/total_loss": 0.13519597053527832 }, { "entropy": 8.897512435913086, "epoch": 0.21129127941467274, "mean_token_accuracy": 0.7393320798873901, "num_tokens": 11743481.0, "step": 2137, "train/ce_loss": 0.6047033667564392 }, { "epoch": 0.21129127941467274, "step": 2137, "train/sim_loss": 0.06640625 }, { "epoch": 0.21129127941467274, "step": 2137, "train/total_loss": 0.1268765926361084 }, { "entropy": 9.42276668548584, "epoch": 0.21139015226418825, "mean_token_accuracy": 0.7388688325881958, "num_tokens": 11748935.0, "step": 2138, "train/ce_loss": 1.1066912412643433 }, { "epoch": 0.21139015226418825, "step": 2138, "train/sim_loss": 0.07421875 }, { "epoch": 0.21139015226418825, "step": 2138, "train/total_loss": 0.18488788604736328 }, { "entropy": 9.066783905029297, "epoch": 0.21148902511370377, "mean_token_accuracy": 0.6676587462425232, "num_tokens": 11754517.0, "step": 2139, "train/ce_loss": 0.6702582240104675 }, { "epoch": 0.21148902511370377, "step": 2139, "train/sim_loss": 0.06640625 }, { "epoch": 0.21148902511370377, "step": 2139, "train/total_loss": 0.133432075381279 }, { "epoch": 0.2115878979632193, "grad_norm": 0.8862593770027161, "learning_rate": 9.473619146516345e-06, "loss": 0.1633, "step": 2140 }, { "entropy": 9.156086921691895, "epoch": 0.2115878979632193, "mean_token_accuracy": 0.7072165012359619, "num_tokens": 11760050.0, "step": 2140, "train/ce_loss": 1.2812256813049316 }, { "epoch": 0.2115878979632193, "step": 2140, "train/sim_loss": 0.078125 }, { "epoch": 0.2115878979632193, "step": 2140, "train/total_loss": 0.20624756813049316 }, { "entropy": 9.266983985900879, "epoch": 0.21168677081273482, "mean_token_accuracy": 0.7823204398155212, "num_tokens": 11765439.0, "step": 2141, "train/ce_loss": 0.6262449622154236 }, { "epoch": 0.21168677081273482, "step": 2141, "train/sim_loss": 0.0703125 }, { "epoch": 0.21168677081273482, "step": 2141, "train/total_loss": 0.1329369992017746 }, { "entropy": 9.012213706970215, "epoch": 0.21178564366225033, "mean_token_accuracy": 0.7664233446121216, "num_tokens": 11771020.0, "step": 2142, "train/ce_loss": 0.7013556361198425 }, { "epoch": 0.21178564366225033, "step": 2142, "train/sim_loss": 0.09765625 }, { "epoch": 0.21178564366225033, "step": 2142, "train/total_loss": 0.16779181361198425 }, { "entropy": 9.47337532043457, "epoch": 0.21188451651176587, "mean_token_accuracy": 0.7306733131408691, "num_tokens": 11776392.0, "step": 2143, "train/ce_loss": 1.0597143173217773 }, { "epoch": 0.21188451651176587, "step": 2143, "train/sim_loss": 0.078125 }, { "epoch": 0.21188451651176587, "step": 2143, "train/total_loss": 0.18409642577171326 }, { "entropy": 8.996399879455566, "epoch": 0.2119833893612814, "mean_token_accuracy": 0.7135778069496155, "num_tokens": 11782001.0, "step": 2144, "train/ce_loss": 0.6301249861717224 }, { "epoch": 0.2119833893612814, "step": 2144, "train/sim_loss": 0.0703125 }, { "epoch": 0.2119833893612814, "step": 2144, "train/total_loss": 0.1333250105381012 }, { "entropy": 9.0086088180542, "epoch": 0.2120822622107969, "mean_token_accuracy": 0.728215754032135, "num_tokens": 11787618.0, "step": 2145, "train/ce_loss": 0.985085129737854 }, { "epoch": 0.2120822622107969, "step": 2145, "train/sim_loss": 0.08984375 }, { "epoch": 0.2120822622107969, "step": 2145, "train/total_loss": 0.18835225701332092 }, { "entropy": 8.92747688293457, "epoch": 0.21218113506031244, "mean_token_accuracy": 0.8088512420654297, "num_tokens": 11793289.0, "step": 2146, "train/ce_loss": 0.37881946563720703 }, { "epoch": 0.21218113506031244, "step": 2146, "train/sim_loss": 0.02734375 }, { "epoch": 0.21218113506031244, "step": 2146, "train/total_loss": 0.06522569805383682 }, { "entropy": 9.216766357421875, "epoch": 0.21228000790982796, "mean_token_accuracy": 0.7375978827476501, "num_tokens": 11798706.0, "step": 2147, "train/ce_loss": 0.4602212607860565 }, { "epoch": 0.21228000790982796, "step": 2147, "train/sim_loss": 0.07421875 }, { "epoch": 0.21228000790982796, "step": 2147, "train/total_loss": 0.12024088203907013 }, { "entropy": 8.91801643371582, "epoch": 0.2123788807593435, "mean_token_accuracy": 0.762876570224762, "num_tokens": 11804395.0, "step": 2148, "train/ce_loss": 0.4570021331310272 }, { "epoch": 0.2123788807593435, "step": 2148, "train/sim_loss": 0.046875 }, { "epoch": 0.2123788807593435, "step": 2148, "train/total_loss": 0.09257521480321884 }, { "entropy": 9.05521011352539, "epoch": 0.212477753608859, "mean_token_accuracy": 0.7379862666130066, "num_tokens": 11809980.0, "step": 2149, "train/ce_loss": 0.6074749231338501 }, { "epoch": 0.212477753608859, "step": 2149, "train/sim_loss": 0.0703125 }, { "epoch": 0.212477753608859, "step": 2149, "train/total_loss": 0.13105998933315277 }, { "entropy": 9.264250755310059, "epoch": 0.21257662645837452, "mean_token_accuracy": 0.7559523582458496, "num_tokens": 11815481.0, "step": 2150, "train/ce_loss": 0.6527594923973083 }, { "epoch": 0.21257662645837452, "step": 2150, "train/sim_loss": 0.08984375 }, { "epoch": 0.21257662645837452, "step": 2150, "train/total_loss": 0.15511970221996307 }, { "entropy": 8.964212417602539, "epoch": 0.21267549930789006, "mean_token_accuracy": 0.6850152611732483, "num_tokens": 11821099.0, "step": 2151, "train/ce_loss": 1.2764462232589722 }, { "epoch": 0.21267549930789006, "step": 2151, "train/sim_loss": 0.06640625 }, { "epoch": 0.21267549930789006, "step": 2151, "train/total_loss": 0.1940508782863617 }, { "entropy": 9.30479907989502, "epoch": 0.21277437215740558, "mean_token_accuracy": 0.7472661137580872, "num_tokens": 11826511.0, "step": 2152, "train/ce_loss": 0.7031465172767639 }, { "epoch": 0.21277437215740558, "step": 2152, "train/sim_loss": 0.0546875 }, { "epoch": 0.21277437215740558, "step": 2152, "train/total_loss": 0.12500214576721191 }, { "entropy": 9.50535774230957, "epoch": 0.2128732450069211, "mean_token_accuracy": 0.7585752010345459, "num_tokens": 11831825.0, "step": 2153, "train/ce_loss": 0.5763528347015381 }, { "epoch": 0.2128732450069211, "step": 2153, "train/sim_loss": 0.09375 }, { "epoch": 0.2128732450069211, "step": 2153, "train/total_loss": 0.15138527750968933 }, { "entropy": 9.097603797912598, "epoch": 0.21297211785643663, "mean_token_accuracy": 0.7327293157577515, "num_tokens": 11837302.0, "step": 2154, "train/ce_loss": 0.8242121338844299 }, { "epoch": 0.21297211785643663, "step": 2154, "train/sim_loss": 0.08203125 }, { "epoch": 0.21297211785643663, "step": 2154, "train/total_loss": 0.164452463388443 }, { "entropy": 8.932534217834473, "epoch": 0.21307099070595215, "mean_token_accuracy": 0.7262905240058899, "num_tokens": 11842785.0, "step": 2155, "train/ce_loss": 1.1495753526687622 }, { "epoch": 0.21307099070595215, "step": 2155, "train/sim_loss": 0.1328125 }, { "epoch": 0.21307099070595215, "step": 2155, "train/total_loss": 0.2477700412273407 }, { "entropy": 9.401849746704102, "epoch": 0.21316986355546766, "mean_token_accuracy": 0.7553763389587402, "num_tokens": 11848308.0, "step": 2156, "train/ce_loss": 0.8175883293151855 }, { "epoch": 0.21316986355546766, "step": 2156, "train/sim_loss": 0.0546875 }, { "epoch": 0.21316986355546766, "step": 2156, "train/total_loss": 0.13644632697105408 }, { "entropy": 9.182430267333984, "epoch": 0.2132687364049832, "mean_token_accuracy": 0.7295514345169067, "num_tokens": 11853627.0, "step": 2157, "train/ce_loss": 0.8647571802139282 }, { "epoch": 0.2132687364049832, "step": 2157, "train/sim_loss": 0.09765625 }, { "epoch": 0.2132687364049832, "step": 2157, "train/total_loss": 0.18413197994232178 }, { "entropy": 9.231685638427734, "epoch": 0.2133676092544987, "mean_token_accuracy": 0.7541841268539429, "num_tokens": 11859183.0, "step": 2158, "train/ce_loss": 0.6478759050369263 }, { "epoch": 0.2133676092544987, "step": 2158, "train/sim_loss": 0.078125 }, { "epoch": 0.2133676092544987, "step": 2158, "train/total_loss": 0.1429125964641571 }, { "entropy": 8.934823036193848, "epoch": 0.21346648210401423, "mean_token_accuracy": 0.7284644246101379, "num_tokens": 11864840.0, "step": 2159, "train/ce_loss": 0.3597141206264496 }, { "epoch": 0.21346648210401423, "step": 2159, "train/sim_loss": 0.06640625 }, { "epoch": 0.21346648210401423, "step": 2159, "train/total_loss": 0.10237766802310944 }, { "epoch": 0.21356535495352977, "grad_norm": 0.7906326055526733, "learning_rate": 9.468674281758395e-06, "loss": 0.1551, "step": 2160 }, { "entropy": 9.331184387207031, "epoch": 0.21356535495352977, "mean_token_accuracy": 0.6482353210449219, "num_tokens": 11870323.0, "step": 2160, "train/ce_loss": 1.0788098573684692 }, { "epoch": 0.21356535495352977, "step": 2160, "train/sim_loss": 0.046875 }, { "epoch": 0.21356535495352977, "step": 2160, "train/total_loss": 0.15475597977638245 }, { "entropy": 9.248575210571289, "epoch": 0.21366422780304528, "mean_token_accuracy": 0.7735602259635925, "num_tokens": 11875651.0, "step": 2161, "train/ce_loss": 0.7537166476249695 }, { "epoch": 0.21366422780304528, "step": 2161, "train/sim_loss": 0.0703125 }, { "epoch": 0.21366422780304528, "step": 2161, "train/total_loss": 0.1456841677427292 }, { "entropy": 9.447821617126465, "epoch": 0.2137631006525608, "mean_token_accuracy": 0.7861035466194153, "num_tokens": 11881019.0, "step": 2162, "train/ce_loss": 0.39820966124534607 }, { "epoch": 0.2137631006525608, "step": 2162, "train/sim_loss": 0.078125 }, { "epoch": 0.2137631006525608, "step": 2162, "train/total_loss": 0.11794596910476685 }, { "entropy": 8.847949981689453, "epoch": 0.21386197350207634, "mean_token_accuracy": 0.8157894611358643, "num_tokens": 11886700.0, "step": 2163, "train/ce_loss": 0.6036969423294067 }, { "epoch": 0.21386197350207634, "step": 2163, "train/sim_loss": 0.0546875 }, { "epoch": 0.21386197350207634, "step": 2163, "train/total_loss": 0.11505720019340515 }, { "entropy": 8.75810718536377, "epoch": 0.21396084635159185, "mean_token_accuracy": 0.7404505610466003, "num_tokens": 11892579.0, "step": 2164, "train/ce_loss": 1.0418299436569214 }, { "epoch": 0.21396084635159185, "step": 2164, "train/sim_loss": 0.13671875 }, { "epoch": 0.21396084635159185, "step": 2164, "train/total_loss": 0.24090173840522766 }, { "entropy": 9.373746871948242, "epoch": 0.21405971920110736, "mean_token_accuracy": 0.7404255270957947, "num_tokens": 11897848.0, "step": 2165, "train/ce_loss": 0.4881204664707184 }, { "epoch": 0.21405971920110736, "step": 2165, "train/sim_loss": 0.078125 }, { "epoch": 0.21405971920110736, "step": 2165, "train/total_loss": 0.12693704664707184 }, { "entropy": 9.083601951599121, "epoch": 0.2141585920506229, "mean_token_accuracy": 0.8101660013198853, "num_tokens": 11903394.0, "step": 2166, "train/ce_loss": 0.8150098919868469 }, { "epoch": 0.2141585920506229, "step": 2166, "train/sim_loss": 0.03125 }, { "epoch": 0.2141585920506229, "step": 2166, "train/total_loss": 0.11275099217891693 }, { "entropy": 9.099020957946777, "epoch": 0.21425746490013842, "mean_token_accuracy": 0.696789562702179, "num_tokens": 11908906.0, "step": 2167, "train/ce_loss": 0.7952373027801514 }, { "epoch": 0.21425746490013842, "step": 2167, "train/sim_loss": 0.0625 }, { "epoch": 0.21425746490013842, "step": 2167, "train/total_loss": 0.1420237421989441 }, { "entropy": 9.284154891967773, "epoch": 0.21435633774965396, "mean_token_accuracy": 0.6962332725524902, "num_tokens": 11914364.0, "step": 2168, "train/ce_loss": 1.4024072885513306 }, { "epoch": 0.21435633774965396, "step": 2168, "train/sim_loss": 0.078125 }, { "epoch": 0.21435633774965396, "step": 2168, "train/total_loss": 0.21836572885513306 }, { "entropy": 8.808926582336426, "epoch": 0.21445521059916947, "mean_token_accuracy": 0.717597484588623, "num_tokens": 11919924.0, "step": 2169, "train/ce_loss": 0.7502585649490356 }, { "epoch": 0.21445521059916947, "step": 2169, "train/sim_loss": 0.0546875 }, { "epoch": 0.21445521059916947, "step": 2169, "train/total_loss": 0.12971335649490356 }, { "entropy": 9.072853088378906, "epoch": 0.21455408344868498, "mean_token_accuracy": 0.762691855430603, "num_tokens": 11925412.0, "step": 2170, "train/ce_loss": 0.6226568222045898 }, { "epoch": 0.21455408344868498, "step": 2170, "train/sim_loss": 0.0234375 }, { "epoch": 0.21455408344868498, "step": 2170, "train/total_loss": 0.08570317924022675 }, { "entropy": 8.988180160522461, "epoch": 0.21465295629820053, "mean_token_accuracy": 0.8169868588447571, "num_tokens": 11930972.0, "step": 2171, "train/ce_loss": 0.3053205907344818 }, { "epoch": 0.21465295629820053, "step": 2171, "train/sim_loss": 0.0625 }, { "epoch": 0.21465295629820053, "step": 2171, "train/total_loss": 0.09303206205368042 }, { "entropy": 8.954286575317383, "epoch": 0.21475182914771604, "mean_token_accuracy": 0.7414893507957458, "num_tokens": 11936522.0, "step": 2172, "train/ce_loss": 0.6196666359901428 }, { "epoch": 0.21475182914771604, "step": 2172, "train/sim_loss": 0.0234375 }, { "epoch": 0.21475182914771604, "step": 2172, "train/total_loss": 0.0854041650891304 }, { "entropy": 9.525686264038086, "epoch": 0.21485070199723155, "mean_token_accuracy": 0.730711042881012, "num_tokens": 11941746.0, "step": 2173, "train/ce_loss": 0.7115749716758728 }, { "epoch": 0.21485070199723155, "step": 2173, "train/sim_loss": 0.046875 }, { "epoch": 0.21485070199723155, "step": 2173, "train/total_loss": 0.11803250014781952 }, { "entropy": 9.168397903442383, "epoch": 0.2149495748467471, "mean_token_accuracy": 0.7639046311378479, "num_tokens": 11947304.0, "step": 2174, "train/ce_loss": 0.46332696080207825 }, { "epoch": 0.2149495748467471, "step": 2174, "train/sim_loss": 0.05859375 }, { "epoch": 0.2149495748467471, "step": 2174, "train/total_loss": 0.1049264520406723 }, { "entropy": 9.141965866088867, "epoch": 0.2150484476962626, "mean_token_accuracy": 0.7082802653312683, "num_tokens": 11952692.0, "step": 2175, "train/ce_loss": 1.2847363948822021 }, { "epoch": 0.2150484476962626, "step": 2175, "train/sim_loss": 0.08203125 }, { "epoch": 0.2150484476962626, "step": 2175, "train/total_loss": 0.21050488948822021 }, { "entropy": 9.20834732055664, "epoch": 0.21514732054577812, "mean_token_accuracy": 0.769792914390564, "num_tokens": 11958036.0, "step": 2176, "train/ce_loss": 0.4686487317085266 }, { "epoch": 0.21514732054577812, "step": 2176, "train/sim_loss": 0.02734375 }, { "epoch": 0.21514732054577812, "step": 2176, "train/total_loss": 0.07420862466096878 }, { "entropy": 8.903284072875977, "epoch": 0.21524619339529366, "mean_token_accuracy": 0.6942909955978394, "num_tokens": 11963691.0, "step": 2177, "train/ce_loss": 2.082163095474243 }, { "epoch": 0.21524619339529366, "step": 2177, "train/sim_loss": 0.078125 }, { "epoch": 0.21524619339529366, "step": 2177, "train/total_loss": 0.2863413095474243 }, { "entropy": 9.39302921295166, "epoch": 0.21534506624480917, "mean_token_accuracy": 0.7723785042762756, "num_tokens": 11969027.0, "step": 2178, "train/ce_loss": 0.6231851577758789 }, { "epoch": 0.21534506624480917, "step": 2178, "train/sim_loss": 0.03125 }, { "epoch": 0.21534506624480917, "step": 2178, "train/total_loss": 0.09356851875782013 }, { "entropy": 8.976602554321289, "epoch": 0.2154439390943247, "mean_token_accuracy": 0.7532981634140015, "num_tokens": 11974401.0, "step": 2179, "train/ce_loss": 0.8224877715110779 }, { "epoch": 0.2154439390943247, "step": 2179, "train/sim_loss": 0.12109375 }, { "epoch": 0.2154439390943247, "step": 2179, "train/total_loss": 0.2033425271511078 }, { "epoch": 0.21554281194384023, "grad_norm": 0.944002091884613, "learning_rate": 9.463729417000446e-06, "loss": 0.1525, "step": 2180 }, { "entropy": 9.007086753845215, "epoch": 0.21554281194384023, "mean_token_accuracy": 0.7544842958450317, "num_tokens": 11979854.0, "step": 2180, "train/ce_loss": 1.2704429626464844 }, { "epoch": 0.21554281194384023, "step": 2180, "train/sim_loss": 0.05859375 }, { "epoch": 0.21554281194384023, "step": 2180, "train/total_loss": 0.18563805520534515 }, { "entropy": 9.325379371643066, "epoch": 0.21564168479335574, "mean_token_accuracy": 0.7086330652236938, "num_tokens": 11985268.0, "step": 2181, "train/ce_loss": 0.6756832003593445 }, { "epoch": 0.21564168479335574, "step": 2181, "train/sim_loss": 0.0859375 }, { "epoch": 0.21564168479335574, "step": 2181, "train/total_loss": 0.1535058319568634 }, { "entropy": 8.967180252075195, "epoch": 0.21574055764287126, "mean_token_accuracy": 0.7821782231330872, "num_tokens": 11990818.0, "step": 2182, "train/ce_loss": 0.7983847260475159 }, { "epoch": 0.21574055764287126, "step": 2182, "train/sim_loss": 0.08984375 }, { "epoch": 0.21574055764287126, "step": 2182, "train/total_loss": 0.16968223452568054 }, { "entropy": 8.95667839050293, "epoch": 0.2158394304923868, "mean_token_accuracy": 0.7826552391052246, "num_tokens": 11996567.0, "step": 2183, "train/ce_loss": 0.336506724357605 }, { "epoch": 0.2158394304923868, "step": 2183, "train/sim_loss": 0.03125 }, { "epoch": 0.2158394304923868, "step": 2183, "train/total_loss": 0.06490067392587662 }, { "entropy": 8.871912002563477, "epoch": 0.2159383033419023, "mean_token_accuracy": 0.7497446537017822, "num_tokens": 12002205.0, "step": 2184, "train/ce_loss": 0.8406050801277161 }, { "epoch": 0.2159383033419023, "step": 2184, "train/sim_loss": 0.05859375 }, { "epoch": 0.2159383033419023, "step": 2184, "train/total_loss": 0.14265426993370056 }, { "entropy": 9.151744842529297, "epoch": 0.21603717619141782, "mean_token_accuracy": 0.6995249390602112, "num_tokens": 12007722.0, "step": 2185, "train/ce_loss": 0.6841529011726379 }, { "epoch": 0.21603717619141782, "step": 2185, "train/sim_loss": 0.10546875 }, { "epoch": 0.21603717619141782, "step": 2185, "train/total_loss": 0.17388403415679932 }, { "entropy": 8.395662307739258, "epoch": 0.21613604904093336, "mean_token_accuracy": 0.7195600867271423, "num_tokens": 12013580.0, "step": 2186, "train/ce_loss": 0.43239256739616394 }, { "epoch": 0.21613604904093336, "step": 2186, "train/sim_loss": 0.1015625 }, { "epoch": 0.21613604904093336, "step": 2186, "train/total_loss": 0.1448017656803131 }, { "entropy": 9.172821998596191, "epoch": 0.21623492189044888, "mean_token_accuracy": 0.7525891661643982, "num_tokens": 12019013.0, "step": 2187, "train/ce_loss": 0.3812854290008545 }, { "epoch": 0.21623492189044888, "step": 2187, "train/sim_loss": 0.0234375 }, { "epoch": 0.21623492189044888, "step": 2187, "train/total_loss": 0.06156604364514351 }, { "entropy": 9.275386810302734, "epoch": 0.21633379473996442, "mean_token_accuracy": 0.7203065156936646, "num_tokens": 12024409.0, "step": 2188, "train/ce_loss": 1.0687146186828613 }, { "epoch": 0.21633379473996442, "step": 2188, "train/sim_loss": 0.078125 }, { "epoch": 0.21633379473996442, "step": 2188, "train/total_loss": 0.18499645590782166 }, { "entropy": 9.346619606018066, "epoch": 0.21643266758947993, "mean_token_accuracy": 0.7711442708969116, "num_tokens": 12030007.0, "step": 2189, "train/ce_loss": 0.7600885033607483 }, { "epoch": 0.21643266758947993, "step": 2189, "train/sim_loss": 0.08203125 }, { "epoch": 0.21643266758947993, "step": 2189, "train/total_loss": 0.1580401062965393 }, { "entropy": 9.123140335083008, "epoch": 0.21653154043899545, "mean_token_accuracy": 0.7804877758026123, "num_tokens": 12035396.0, "step": 2190, "train/ce_loss": 0.3369859755039215 }, { "epoch": 0.21653154043899545, "step": 2190, "train/sim_loss": 0.03125 }, { "epoch": 0.21653154043899545, "step": 2190, "train/total_loss": 0.06494860351085663 }, { "entropy": 8.956740379333496, "epoch": 0.216630413288511, "mean_token_accuracy": 0.7424547076225281, "num_tokens": 12040917.0, "step": 2191, "train/ce_loss": 1.2359898090362549 }, { "epoch": 0.216630413288511, "step": 2191, "train/sim_loss": 0.109375 }, { "epoch": 0.216630413288511, "step": 2191, "train/total_loss": 0.23297399282455444 }, { "entropy": 8.583629608154297, "epoch": 0.2167292861380265, "mean_token_accuracy": 0.708777666091919, "num_tokens": 12046688.0, "step": 2192, "train/ce_loss": 1.010187029838562 }, { "epoch": 0.2167292861380265, "step": 2192, "train/sim_loss": 0.078125 }, { "epoch": 0.2167292861380265, "step": 2192, "train/total_loss": 0.17914369702339172 }, { "entropy": 8.826120376586914, "epoch": 0.216828158987542, "mean_token_accuracy": 0.7598385214805603, "num_tokens": 12052352.0, "step": 2193, "train/ce_loss": 0.7662855386734009 }, { "epoch": 0.216828158987542, "step": 2193, "train/sim_loss": 0.06640625 }, { "epoch": 0.216828158987542, "step": 2193, "train/total_loss": 0.14303481578826904 }, { "entropy": 8.987380981445312, "epoch": 0.21692703183705755, "mean_token_accuracy": 0.7153024673461914, "num_tokens": 12057835.0, "step": 2194, "train/ce_loss": 0.730076253414154 }, { "epoch": 0.21692703183705755, "step": 2194, "train/sim_loss": 0.1484375 }, { "epoch": 0.21692703183705755, "step": 2194, "train/total_loss": 0.22144512832164764 }, { "entropy": 9.247431755065918, "epoch": 0.21702590468657307, "mean_token_accuracy": 0.7256410121917725, "num_tokens": 12063209.0, "step": 2195, "train/ce_loss": 0.7511898875236511 }, { "epoch": 0.21702590468657307, "step": 2195, "train/sim_loss": 0.05078125 }, { "epoch": 0.21702590468657307, "step": 2195, "train/total_loss": 0.1259002387523651 }, { "entropy": 9.134023666381836, "epoch": 0.21712477753608858, "mean_token_accuracy": 0.7285902500152588, "num_tokens": 12068591.0, "step": 2196, "train/ce_loss": 0.3892326056957245 }, { "epoch": 0.21712477753608858, "step": 2196, "train/sim_loss": 0.07421875 }, { "epoch": 0.21712477753608858, "step": 2196, "train/total_loss": 0.11314201354980469 }, { "entropy": 9.166153907775879, "epoch": 0.21722365038560412, "mean_token_accuracy": 0.7228915691375732, "num_tokens": 12073871.0, "step": 2197, "train/ce_loss": 1.0192383527755737 }, { "epoch": 0.21722365038560412, "step": 2197, "train/sim_loss": 0.06640625 }, { "epoch": 0.21722365038560412, "step": 2197, "train/total_loss": 0.1683300882577896 }, { "entropy": 8.506303787231445, "epoch": 0.21732252323511964, "mean_token_accuracy": 0.7549880146980286, "num_tokens": 12079664.0, "step": 2198, "train/ce_loss": 0.4574704170227051 }, { "epoch": 0.21732252323511964, "step": 2198, "train/sim_loss": 0.05859375 }, { "epoch": 0.21732252323511964, "step": 2198, "train/total_loss": 0.10434079170227051 }, { "entropy": 9.098962783813477, "epoch": 0.21742139608463515, "mean_token_accuracy": 0.7684210538864136, "num_tokens": 12085123.0, "step": 2199, "train/ce_loss": 0.42653581500053406 }, { "epoch": 0.21742139608463515, "step": 2199, "train/sim_loss": 0.0859375 }, { "epoch": 0.21742139608463515, "step": 2199, "train/total_loss": 0.12859109044075012 }, { "epoch": 0.2175202689341507, "grad_norm": 0.8113583922386169, "learning_rate": 9.458784552242498e-06, "loss": 0.1509, "step": 2200 }, { "entropy": 8.989877700805664, "epoch": 0.2175202689341507, "mean_token_accuracy": 0.7445708513259888, "num_tokens": 12090749.0, "step": 2200, "train/ce_loss": 0.7782604694366455 }, { "epoch": 0.2175202689341507, "step": 2200, "train/sim_loss": 0.07421875 }, { "epoch": 0.2175202689341507, "step": 2200, "train/total_loss": 0.15204480290412903 }, { "entropy": 9.248990058898926, "epoch": 0.2176191417836662, "mean_token_accuracy": 0.7144653797149658, "num_tokens": 12096191.0, "step": 2201, "train/ce_loss": 0.4289693534374237 }, { "epoch": 0.2176191417836662, "step": 2201, "train/sim_loss": 0.0546875 }, { "epoch": 0.2176191417836662, "step": 2201, "train/total_loss": 0.09758444130420685 }, { "entropy": 8.640788078308105, "epoch": 0.21771801463318172, "mean_token_accuracy": 0.7598039507865906, "num_tokens": 12101847.0, "step": 2202, "train/ce_loss": 0.7135042548179626 }, { "epoch": 0.21771801463318172, "step": 2202, "train/sim_loss": 0.02734375 }, { "epoch": 0.21771801463318172, "step": 2202, "train/total_loss": 0.09869417548179626 }, { "entropy": 9.008526802062988, "epoch": 0.21781688748269726, "mean_token_accuracy": 0.7414578795433044, "num_tokens": 12107279.0, "step": 2203, "train/ce_loss": 0.9218137860298157 }, { "epoch": 0.21781688748269726, "step": 2203, "train/sim_loss": 0.09375 }, { "epoch": 0.21781688748269726, "step": 2203, "train/total_loss": 0.18593138456344604 }, { "entropy": 9.342008590698242, "epoch": 0.21791576033221277, "mean_token_accuracy": 0.7044887542724609, "num_tokens": 12112730.0, "step": 2204, "train/ce_loss": 0.6716903448104858 }, { "epoch": 0.21791576033221277, "step": 2204, "train/sim_loss": 0.0234375 }, { "epoch": 0.21791576033221277, "step": 2204, "train/total_loss": 0.09060653299093246 }, { "entropy": 9.115938186645508, "epoch": 0.21801463318172828, "mean_token_accuracy": 0.713032603263855, "num_tokens": 12118170.0, "step": 2205, "train/ce_loss": 0.6629267930984497 }, { "epoch": 0.21801463318172828, "step": 2205, "train/sim_loss": 0.07421875 }, { "epoch": 0.21801463318172828, "step": 2205, "train/total_loss": 0.1405114233493805 }, { "entropy": 9.212739944458008, "epoch": 0.21811350603124383, "mean_token_accuracy": 0.800000011920929, "num_tokens": 12123531.0, "step": 2206, "train/ce_loss": 0.46304237842559814 }, { "epoch": 0.21811350603124383, "step": 2206, "train/sim_loss": 0.04296875 }, { "epoch": 0.21811350603124383, "step": 2206, "train/total_loss": 0.08927299082279205 }, { "entropy": 8.830209732055664, "epoch": 0.21821237888075934, "mean_token_accuracy": 0.8359621167182922, "num_tokens": 12129125.0, "step": 2207, "train/ce_loss": 0.37546712160110474 }, { "epoch": 0.21821237888075934, "step": 2207, "train/sim_loss": 0.0234375 }, { "epoch": 0.21821237888075934, "step": 2207, "train/total_loss": 0.06098421290516853 }, { "entropy": 8.89437484741211, "epoch": 0.21831125173027488, "mean_token_accuracy": 0.7702702879905701, "num_tokens": 12134654.0, "step": 2208, "train/ce_loss": 0.8285568356513977 }, { "epoch": 0.21831125173027488, "step": 2208, "train/sim_loss": 0.046875 }, { "epoch": 0.21831125173027488, "step": 2208, "train/total_loss": 0.129730686545372 }, { "entropy": 9.501925468444824, "epoch": 0.2184101245797904, "mean_token_accuracy": 0.6934812664985657, "num_tokens": 12139962.0, "step": 2209, "train/ce_loss": 0.948724627494812 }, { "epoch": 0.2184101245797904, "step": 2209, "train/sim_loss": 0.1015625 }, { "epoch": 0.2184101245797904, "step": 2209, "train/total_loss": 0.19643497467041016 }, { "entropy": 9.41552734375, "epoch": 0.2185089974293059, "mean_token_accuracy": 0.7181817889213562, "num_tokens": 12145254.0, "step": 2210, "train/ce_loss": 0.573506236076355 }, { "epoch": 0.2185089974293059, "step": 2210, "train/sim_loss": 0.0625 }, { "epoch": 0.2185089974293059, "step": 2210, "train/total_loss": 0.11985062062740326 }, { "entropy": 9.019449234008789, "epoch": 0.21860787027882145, "mean_token_accuracy": 0.8136574029922485, "num_tokens": 12150757.0, "step": 2211, "train/ce_loss": 0.6066716313362122 }, { "epoch": 0.21860787027882145, "step": 2211, "train/sim_loss": 0.02734375 }, { "epoch": 0.21860787027882145, "step": 2211, "train/total_loss": 0.08801091462373734 }, { "entropy": 9.167232513427734, "epoch": 0.21870674312833696, "mean_token_accuracy": 0.7099879384040833, "num_tokens": 12156152.0, "step": 2212, "train/ce_loss": 1.151693344116211 }, { "epoch": 0.21870674312833696, "step": 2212, "train/sim_loss": 0.04296875 }, { "epoch": 0.21870674312833696, "step": 2212, "train/total_loss": 0.15813809633255005 }, { "entropy": 9.294600486755371, "epoch": 0.21880561597785247, "mean_token_accuracy": 0.7493734359741211, "num_tokens": 12161448.0, "step": 2213, "train/ce_loss": 0.5210123658180237 }, { "epoch": 0.21880561597785247, "step": 2213, "train/sim_loss": 0.05859375 }, { "epoch": 0.21880561597785247, "step": 2213, "train/total_loss": 0.1106949895620346 }, { "entropy": 8.814798355102539, "epoch": 0.21890448882736802, "mean_token_accuracy": 0.7829217910766602, "num_tokens": 12167088.0, "step": 2214, "train/ce_loss": 0.9555440545082092 }, { "epoch": 0.21890448882736802, "step": 2214, "train/sim_loss": 0.12109375 }, { "epoch": 0.21890448882736802, "step": 2214, "train/total_loss": 0.2166481614112854 }, { "entropy": 9.013601303100586, "epoch": 0.21900336167688353, "mean_token_accuracy": 0.7928802371025085, "num_tokens": 12172638.0, "step": 2215, "train/ce_loss": 0.8125816583633423 }, { "epoch": 0.21900336167688353, "step": 2215, "train/sim_loss": 0.0625 }, { "epoch": 0.21900336167688353, "step": 2215, "train/total_loss": 0.14375817775726318 }, { "entropy": 9.24868392944336, "epoch": 0.21910223452639904, "mean_token_accuracy": 0.6765463948249817, "num_tokens": 12178104.0, "step": 2216, "train/ce_loss": 1.0059248208999634 }, { "epoch": 0.21910223452639904, "step": 2216, "train/sim_loss": 0.078125 }, { "epoch": 0.21910223452639904, "step": 2216, "train/total_loss": 0.1787174940109253 }, { "entropy": 9.0341796875, "epoch": 0.21920110737591458, "mean_token_accuracy": 0.7532933950424194, "num_tokens": 12183543.0, "step": 2217, "train/ce_loss": 0.3666033148765564 }, { "epoch": 0.21920110737591458, "step": 2217, "train/sim_loss": 0.078125 }, { "epoch": 0.21920110737591458, "step": 2217, "train/total_loss": 0.1147853285074234 }, { "entropy": 8.920690536499023, "epoch": 0.2192999802254301, "mean_token_accuracy": 0.7126673460006714, "num_tokens": 12189149.0, "step": 2218, "train/ce_loss": 1.2705798149108887 }, { "epoch": 0.2192999802254301, "step": 2218, "train/sim_loss": 0.0859375 }, { "epoch": 0.2192999802254301, "step": 2218, "train/total_loss": 0.2129954844713211 }, { "entropy": 8.608755111694336, "epoch": 0.2193988530749456, "mean_token_accuracy": 0.7359597682952881, "num_tokens": 12194887.0, "step": 2219, "train/ce_loss": 0.9149896502494812 }, { "epoch": 0.2193988530749456, "step": 2219, "train/sim_loss": 0.0859375 }, { "epoch": 0.2193988530749456, "step": 2219, "train/total_loss": 0.1774364709854126 }, { "epoch": 0.21949772592446115, "grad_norm": 0.8615835905075073, "learning_rate": 9.453839687484548e-06, "loss": 0.1491, "step": 2220 }, { "entropy": 9.222993850708008, "epoch": 0.21949772592446115, "mean_token_accuracy": 0.759149968624115, "num_tokens": 12200419.0, "step": 2220, "train/ce_loss": 0.9884011745452881 }, { "epoch": 0.21949772592446115, "step": 2220, "train/sim_loss": 0.0625 }, { "epoch": 0.21949772592446115, "step": 2220, "train/total_loss": 0.1613401174545288 }, { "entropy": 8.720367431640625, "epoch": 0.21959659877397666, "mean_token_accuracy": 0.694059431552887, "num_tokens": 12205974.0, "step": 2221, "train/ce_loss": 0.570675790309906 }, { "epoch": 0.21959659877397666, "step": 2221, "train/sim_loss": 0.06640625 }, { "epoch": 0.21959659877397666, "step": 2221, "train/total_loss": 0.12347383052110672 }, { "entropy": 9.13248062133789, "epoch": 0.21969547162349218, "mean_token_accuracy": 0.7358276844024658, "num_tokens": 12211416.0, "step": 2222, "train/ce_loss": 0.7904714941978455 }, { "epoch": 0.21969547162349218, "step": 2222, "train/sim_loss": 0.078125 }, { "epoch": 0.21969547162349218, "step": 2222, "train/total_loss": 0.15717214345932007 }, { "entropy": 9.092660903930664, "epoch": 0.21979434447300772, "mean_token_accuracy": 0.713178277015686, "num_tokens": 12216965.0, "step": 2223, "train/ce_loss": 1.1144567728042603 }, { "epoch": 0.21979434447300772, "step": 2223, "train/sim_loss": 0.09375 }, { "epoch": 0.21979434447300772, "step": 2223, "train/total_loss": 0.20519568026065826 }, { "entropy": 9.307792663574219, "epoch": 0.21989321732252323, "mean_token_accuracy": 0.7560663819313049, "num_tokens": 12222332.0, "step": 2224, "train/ce_loss": 0.8258614540100098 }, { "epoch": 0.21989321732252323, "step": 2224, "train/sim_loss": 0.07421875 }, { "epoch": 0.21989321732252323, "step": 2224, "train/total_loss": 0.1568048894405365 }, { "entropy": 8.301382064819336, "epoch": 0.21999209017203875, "mean_token_accuracy": 0.72635817527771, "num_tokens": 12228389.0, "step": 2225, "train/ce_loss": 1.5294477939605713 }, { "epoch": 0.21999209017203875, "step": 2225, "train/sim_loss": 0.09375 }, { "epoch": 0.21999209017203875, "step": 2225, "train/total_loss": 0.24669478833675385 }, { "entropy": 9.212616920471191, "epoch": 0.2200909630215543, "mean_token_accuracy": 0.7202970385551453, "num_tokens": 12233850.0, "step": 2226, "train/ce_loss": 0.6055426001548767 }, { "epoch": 0.2200909630215543, "step": 2226, "train/sim_loss": 0.07421875 }, { "epoch": 0.2200909630215543, "step": 2226, "train/total_loss": 0.13477301597595215 }, { "entropy": 8.960829734802246, "epoch": 0.2201898358710698, "mean_token_accuracy": 0.7800429463386536, "num_tokens": 12239460.0, "step": 2227, "train/ce_loss": 0.506638765335083 }, { "epoch": 0.2201898358710698, "step": 2227, "train/sim_loss": 0.03515625 }, { "epoch": 0.2201898358710698, "step": 2227, "train/total_loss": 0.08582012355327606 }, { "entropy": 9.353612899780273, "epoch": 0.22028870872058534, "mean_token_accuracy": 0.7035971283912659, "num_tokens": 12244825.0, "step": 2228, "train/ce_loss": 0.8839163184165955 }, { "epoch": 0.22028870872058534, "step": 2228, "train/sim_loss": 0.0703125 }, { "epoch": 0.22028870872058534, "step": 2228, "train/total_loss": 0.15870413184165955 }, { "entropy": 9.24436092376709, "epoch": 0.22038758157010085, "mean_token_accuracy": 0.7378190159797668, "num_tokens": 12250287.0, "step": 2229, "train/ce_loss": 1.0176256895065308 }, { "epoch": 0.22038758157010085, "step": 2229, "train/sim_loss": 0.0625 }, { "epoch": 0.22038758157010085, "step": 2229, "train/total_loss": 0.1642625629901886 }, { "entropy": 8.61416244506836, "epoch": 0.22048645441961637, "mean_token_accuracy": 0.6672144532203674, "num_tokens": 12256008.0, "step": 2230, "train/ce_loss": 2.2672054767608643 }, { "epoch": 0.22048645441961637, "step": 2230, "train/sim_loss": 0.04296875 }, { "epoch": 0.22048645441961637, "step": 2230, "train/total_loss": 0.26968932151794434 }, { "entropy": 8.833093643188477, "epoch": 0.2205853272691319, "mean_token_accuracy": 0.7733333110809326, "num_tokens": 12261571.0, "step": 2231, "train/ce_loss": 0.4599898159503937 }, { "epoch": 0.2205853272691319, "step": 2231, "train/sim_loss": 0.02734375 }, { "epoch": 0.2205853272691319, "step": 2231, "train/total_loss": 0.07334273308515549 }, { "entropy": 8.955930709838867, "epoch": 0.22068420011864742, "mean_token_accuracy": 0.7754279971122742, "num_tokens": 12267140.0, "step": 2232, "train/ce_loss": 0.6422550678253174 }, { "epoch": 0.22068420011864742, "step": 2232, "train/sim_loss": 0.02734375 }, { "epoch": 0.22068420011864742, "step": 2232, "train/total_loss": 0.09156925976276398 }, { "entropy": 9.115323066711426, "epoch": 0.22078307296816294, "mean_token_accuracy": 0.7469066381454468, "num_tokens": 12272676.0, "step": 2233, "train/ce_loss": 0.7064507007598877 }, { "epoch": 0.22078307296816294, "step": 2233, "train/sim_loss": 0.07421875 }, { "epoch": 0.22078307296816294, "step": 2233, "train/total_loss": 0.1448638141155243 }, { "entropy": 8.950870513916016, "epoch": 0.22088194581767848, "mean_token_accuracy": 0.7697228193283081, "num_tokens": 12278230.0, "step": 2234, "train/ce_loss": 0.43317025899887085 }, { "epoch": 0.22088194581767848, "step": 2234, "train/sim_loss": 0.02734375 }, { "epoch": 0.22088194581767848, "step": 2234, "train/total_loss": 0.0706607773900032 }, { "entropy": 9.124919891357422, "epoch": 0.220980818667194, "mean_token_accuracy": 0.7405063509941101, "num_tokens": 12283597.0, "step": 2235, "train/ce_loss": 0.6039170026779175 }, { "epoch": 0.220980818667194, "step": 2235, "train/sim_loss": 0.08984375 }, { "epoch": 0.220980818667194, "step": 2235, "train/total_loss": 0.15023544430732727 }, { "entropy": 9.259320259094238, "epoch": 0.2210796915167095, "mean_token_accuracy": 0.687417209148407, "num_tokens": 12288940.0, "step": 2236, "train/ce_loss": 0.8023043274879456 }, { "epoch": 0.2210796915167095, "step": 2236, "train/sim_loss": 0.09765625 }, { "epoch": 0.2210796915167095, "step": 2236, "train/total_loss": 0.1778866946697235 }, { "entropy": 9.476198196411133, "epoch": 0.22117856436622504, "mean_token_accuracy": 0.757617712020874, "num_tokens": 12294269.0, "step": 2237, "train/ce_loss": 0.8722690343856812 }, { "epoch": 0.22117856436622504, "step": 2237, "train/sim_loss": 0.03515625 }, { "epoch": 0.22117856436622504, "step": 2237, "train/total_loss": 0.12238315492868423 }, { "entropy": 9.003570556640625, "epoch": 0.22127743721574056, "mean_token_accuracy": 0.7047387361526489, "num_tokens": 12299703.0, "step": 2238, "train/ce_loss": 0.7757760882377625 }, { "epoch": 0.22127743721574056, "step": 2238, "train/sim_loss": 0.06640625 }, { "epoch": 0.22127743721574056, "step": 2238, "train/total_loss": 0.1439838707447052 }, { "entropy": 8.955242156982422, "epoch": 0.22137631006525607, "mean_token_accuracy": 0.7385759949684143, "num_tokens": 12305257.0, "step": 2239, "train/ce_loss": 0.8822928071022034 }, { "epoch": 0.22137631006525607, "step": 2239, "train/sim_loss": 0.078125 }, { "epoch": 0.22137631006525607, "step": 2239, "train/total_loss": 0.16635428369045258 }, { "epoch": 0.2214751829147716, "grad_norm": 0.9519903659820557, "learning_rate": 9.4488948227266e-06, "loss": 0.1549, "step": 2240 }, { "entropy": 9.274029731750488, "epoch": 0.2214751829147716, "mean_token_accuracy": 0.7161290049552917, "num_tokens": 12310677.0, "step": 2240, "train/ce_loss": 0.8535448312759399 }, { "epoch": 0.2214751829147716, "step": 2240, "train/sim_loss": 0.09765625 }, { "epoch": 0.2214751829147716, "step": 2240, "train/total_loss": 0.18301072716712952 }, { "entropy": 8.866333961486816, "epoch": 0.22157405576428713, "mean_token_accuracy": 0.761457085609436, "num_tokens": 12316174.0, "step": 2241, "train/ce_loss": 0.8339522480964661 }, { "epoch": 0.22157405576428713, "step": 2241, "train/sim_loss": 0.1015625 }, { "epoch": 0.22157405576428713, "step": 2241, "train/total_loss": 0.18495772778987885 }, { "entropy": 8.694230079650879, "epoch": 0.22167292861380264, "mean_token_accuracy": 0.6978984475135803, "num_tokens": 12321988.0, "step": 2242, "train/ce_loss": 0.4453448951244354 }, { "epoch": 0.22167292861380264, "step": 2242, "train/sim_loss": 0.03125 }, { "epoch": 0.22167292861380264, "step": 2242, "train/total_loss": 0.07578448951244354 }, { "entropy": 8.809189796447754, "epoch": 0.22177180146331818, "mean_token_accuracy": 0.7320125102996826, "num_tokens": 12327495.0, "step": 2243, "train/ce_loss": 0.9564871788024902 }, { "epoch": 0.22177180146331818, "step": 2243, "train/sim_loss": 0.06640625 }, { "epoch": 0.22177180146331818, "step": 2243, "train/total_loss": 0.16205497086048126 }, { "entropy": 8.389833450317383, "epoch": 0.2218706743128337, "mean_token_accuracy": 0.7308642268180847, "num_tokens": 12333315.0, "step": 2244, "train/ce_loss": 0.37556397914886475 }, { "epoch": 0.2218706743128337, "step": 2244, "train/sim_loss": 0.0625 }, { "epoch": 0.2218706743128337, "step": 2244, "train/total_loss": 0.10005639493465424 }, { "entropy": 9.003235816955566, "epoch": 0.2219695471623492, "mean_token_accuracy": 0.8324205875396729, "num_tokens": 12338785.0, "step": 2245, "train/ce_loss": 0.4365626871585846 }, { "epoch": 0.2219695471623492, "step": 2245, "train/sim_loss": 0.08203125 }, { "epoch": 0.2219695471623492, "step": 2245, "train/total_loss": 0.12568752467632294 }, { "entropy": 9.045141220092773, "epoch": 0.22206842001186475, "mean_token_accuracy": 0.7394958138465881, "num_tokens": 12344276.0, "step": 2246, "train/ce_loss": 0.8114592432975769 }, { "epoch": 0.22206842001186475, "step": 2246, "train/sim_loss": 0.046875 }, { "epoch": 0.22206842001186475, "step": 2246, "train/total_loss": 0.12802092730998993 }, { "entropy": 8.777359008789062, "epoch": 0.22216729286138026, "mean_token_accuracy": 0.7748414278030396, "num_tokens": 12349865.0, "step": 2247, "train/ce_loss": 0.534114420413971 }, { "epoch": 0.22216729286138026, "step": 2247, "train/sim_loss": 0.03125 }, { "epoch": 0.22216729286138026, "step": 2247, "train/total_loss": 0.08466143906116486 }, { "entropy": 9.404213905334473, "epoch": 0.22226616571089577, "mean_token_accuracy": 0.7180192470550537, "num_tokens": 12355171.0, "step": 2248, "train/ce_loss": 0.7035179734230042 }, { "epoch": 0.22226616571089577, "step": 2248, "train/sim_loss": 0.08984375 }, { "epoch": 0.22226616571089577, "step": 2248, "train/total_loss": 0.16019555926322937 }, { "entropy": 9.178009986877441, "epoch": 0.22236503856041132, "mean_token_accuracy": 0.7316455841064453, "num_tokens": 12360616.0, "step": 2249, "train/ce_loss": 0.3326762020587921 }, { "epoch": 0.22236503856041132, "step": 2249, "train/sim_loss": 0.0546875 }, { "epoch": 0.22236503856041132, "step": 2249, "train/total_loss": 0.08795511722564697 }, { "entropy": 8.911588668823242, "epoch": 0.22246391140992683, "mean_token_accuracy": 0.8208501935005188, "num_tokens": 12366245.0, "step": 2250, "train/ce_loss": 1.0582396984100342 }, { "epoch": 0.22246391140992683, "step": 2250, "train/sim_loss": 0.08984375 }, { "epoch": 0.22246391140992683, "step": 2250, "train/total_loss": 0.19566771388053894 }, { "entropy": 8.827485084533691, "epoch": 0.22256278425944237, "mean_token_accuracy": 0.7243133187294006, "num_tokens": 12371841.0, "step": 2251, "train/ce_loss": 0.9243972301483154 }, { "epoch": 0.22256278425944237, "step": 2251, "train/sim_loss": 0.0390625 }, { "epoch": 0.22256278425944237, "step": 2251, "train/total_loss": 0.13150222599506378 }, { "entropy": 9.185680389404297, "epoch": 0.22266165710895788, "mean_token_accuracy": 0.7175480723381042, "num_tokens": 12377328.0, "step": 2252, "train/ce_loss": 0.9323478937149048 }, { "epoch": 0.22266165710895788, "step": 2252, "train/sim_loss": 0.06640625 }, { "epoch": 0.22266165710895788, "step": 2252, "train/total_loss": 0.15964104235172272 }, { "entropy": 9.014326095581055, "epoch": 0.2227605299584734, "mean_token_accuracy": 0.7018442749977112, "num_tokens": 12382916.0, "step": 2253, "train/ce_loss": 0.6566764712333679 }, { "epoch": 0.2227605299584734, "step": 2253, "train/sim_loss": 0.078125 }, { "epoch": 0.2227605299584734, "step": 2253, "train/total_loss": 0.14379265904426575 }, { "entropy": 8.927739143371582, "epoch": 0.22285940280798894, "mean_token_accuracy": 0.7481662631034851, "num_tokens": 12388353.0, "step": 2254, "train/ce_loss": 0.6740115880966187 }, { "epoch": 0.22285940280798894, "step": 2254, "train/sim_loss": 0.0390625 }, { "epoch": 0.22285940280798894, "step": 2254, "train/total_loss": 0.10646366328001022 }, { "entropy": 8.78125286102295, "epoch": 0.22295827565750445, "mean_token_accuracy": 0.7270364165306091, "num_tokens": 12394004.0, "step": 2255, "train/ce_loss": 0.7787653207778931 }, { "epoch": 0.22295827565750445, "step": 2255, "train/sim_loss": 0.0703125 }, { "epoch": 0.22295827565750445, "step": 2255, "train/total_loss": 0.14818903803825378 }, { "entropy": 8.875467300415039, "epoch": 0.22305714850701996, "mean_token_accuracy": 0.6974459886550903, "num_tokens": 12399697.0, "step": 2256, "train/ce_loss": 0.6050071120262146 }, { "epoch": 0.22305714850701996, "step": 2256, "train/sim_loss": 0.0546875 }, { "epoch": 0.22305714850701996, "step": 2256, "train/total_loss": 0.11518821120262146 }, { "entropy": 9.304447174072266, "epoch": 0.2231560213565355, "mean_token_accuracy": 0.7236841917037964, "num_tokens": 12405021.0, "step": 2257, "train/ce_loss": 0.6550213694572449 }, { "epoch": 0.2231560213565355, "step": 2257, "train/sim_loss": 0.0546875 }, { "epoch": 0.2231560213565355, "step": 2257, "train/total_loss": 0.12018963694572449 }, { "entropy": 9.21230411529541, "epoch": 0.22325489420605102, "mean_token_accuracy": 0.7600459456443787, "num_tokens": 12410512.0, "step": 2258, "train/ce_loss": 0.9499570727348328 }, { "epoch": 0.22325489420605102, "step": 2258, "train/sim_loss": 0.06640625 }, { "epoch": 0.22325489420605102, "step": 2258, "train/total_loss": 0.16140195727348328 }, { "entropy": 9.073302268981934, "epoch": 0.22335376705556653, "mean_token_accuracy": 0.7894179821014404, "num_tokens": 12416069.0, "step": 2259, "train/ce_loss": 0.8208868503570557 }, { "epoch": 0.22335376705556653, "step": 2259, "train/sim_loss": 0.09375 }, { "epoch": 0.22335376705556653, "step": 2259, "train/total_loss": 0.1758386790752411 }, { "epoch": 0.22345263990508207, "grad_norm": 0.734729528427124, "learning_rate": 9.44394995796865e-06, "loss": 0.1532, "step": 2260 }, { "entropy": 9.478494644165039, "epoch": 0.22345263990508207, "mean_token_accuracy": 0.7054794430732727, "num_tokens": 12421395.0, "step": 2260, "train/ce_loss": 0.9154320359230042 }, { "epoch": 0.22345263990508207, "step": 2260, "train/sim_loss": 0.08984375 }, { "epoch": 0.22345263990508207, "step": 2260, "train/total_loss": 0.18138694763183594 }, { "entropy": 9.195255279541016, "epoch": 0.2235515127545976, "mean_token_accuracy": 0.743888258934021, "num_tokens": 12426899.0, "step": 2261, "train/ce_loss": 1.296502709388733 }, { "epoch": 0.2235515127545976, "step": 2261, "train/sim_loss": 0.12109375 }, { "epoch": 0.2235515127545976, "step": 2261, "train/total_loss": 0.2507440447807312 }, { "entropy": 8.894723892211914, "epoch": 0.2236503856041131, "mean_token_accuracy": 0.7857838273048401, "num_tokens": 12432555.0, "step": 2262, "train/ce_loss": 0.6555893421173096 }, { "epoch": 0.2236503856041131, "step": 2262, "train/sim_loss": 0.03515625 }, { "epoch": 0.2236503856041131, "step": 2262, "train/total_loss": 0.10071518272161484 }, { "entropy": 9.05888557434082, "epoch": 0.22374925845362864, "mean_token_accuracy": 0.7851239442825317, "num_tokens": 12438115.0, "step": 2263, "train/ce_loss": 0.43059781193733215 }, { "epoch": 0.22374925845362864, "step": 2263, "train/sim_loss": 0.03125 }, { "epoch": 0.22374925845362864, "step": 2263, "train/total_loss": 0.07430978119373322 }, { "entropy": 9.027145385742188, "epoch": 0.22384813130314415, "mean_token_accuracy": 0.7180043458938599, "num_tokens": 12443662.0, "step": 2264, "train/ce_loss": 0.5323383808135986 }, { "epoch": 0.22384813130314415, "step": 2264, "train/sim_loss": 0.06640625 }, { "epoch": 0.22384813130314415, "step": 2264, "train/total_loss": 0.11964008957147598 }, { "entropy": 9.1109619140625, "epoch": 0.22394700415265967, "mean_token_accuracy": 0.7289156913757324, "num_tokens": 12449135.0, "step": 2265, "train/ce_loss": 1.0178813934326172 }, { "epoch": 0.22394700415265967, "step": 2265, "train/sim_loss": 0.0546875 }, { "epoch": 0.22394700415265967, "step": 2265, "train/total_loss": 0.15647563338279724 }, { "entropy": 9.298486709594727, "epoch": 0.2240458770021752, "mean_token_accuracy": 0.7133917212486267, "num_tokens": 12454557.0, "step": 2266, "train/ce_loss": 1.3623218536376953 }, { "epoch": 0.2240458770021752, "step": 2266, "train/sim_loss": 0.08984375 }, { "epoch": 0.2240458770021752, "step": 2266, "train/total_loss": 0.2260759323835373 }, { "entropy": 9.20012092590332, "epoch": 0.22414474985169072, "mean_token_accuracy": 0.7002341747283936, "num_tokens": 12459953.0, "step": 2267, "train/ce_loss": 1.0680876970291138 }, { "epoch": 0.22414474985169072, "step": 2267, "train/sim_loss": 0.05078125 }, { "epoch": 0.22414474985169072, "step": 2267, "train/total_loss": 0.15759003162384033 }, { "entropy": 9.154695510864258, "epoch": 0.22424362270120624, "mean_token_accuracy": 0.7816979289054871, "num_tokens": 12465458.0, "step": 2268, "train/ce_loss": 0.5780136585235596 }, { "epoch": 0.22424362270120624, "step": 2268, "train/sim_loss": 0.01953125 }, { "epoch": 0.22424362270120624, "step": 2268, "train/total_loss": 0.07733261585235596 }, { "entropy": 9.398964881896973, "epoch": 0.22434249555072178, "mean_token_accuracy": 0.7496774196624756, "num_tokens": 12470816.0, "step": 2269, "train/ce_loss": 0.504913866519928 }, { "epoch": 0.22434249555072178, "step": 2269, "train/sim_loss": 0.078125 }, { "epoch": 0.22434249555072178, "step": 2269, "train/total_loss": 0.12861639261245728 }, { "entropy": 9.024200439453125, "epoch": 0.2244413684002373, "mean_token_accuracy": 0.6756756901741028, "num_tokens": 12476250.0, "step": 2270, "train/ce_loss": 2.144893169403076 }, { "epoch": 0.2244413684002373, "step": 2270, "train/sim_loss": 0.0546875 }, { "epoch": 0.2244413684002373, "step": 2270, "train/total_loss": 0.2691768407821655 }, { "entropy": 9.100531578063965, "epoch": 0.22454024124975283, "mean_token_accuracy": 0.760401725769043, "num_tokens": 12481609.0, "step": 2271, "train/ce_loss": 0.8023784160614014 }, { "epoch": 0.22454024124975283, "step": 2271, "train/sim_loss": 0.05859375 }, { "epoch": 0.22454024124975283, "step": 2271, "train/total_loss": 0.13883158564567566 }, { "entropy": 9.063339233398438, "epoch": 0.22463911409926834, "mean_token_accuracy": 0.7334035634994507, "num_tokens": 12487137.0, "step": 2272, "train/ce_loss": 0.45443716645240784 }, { "epoch": 0.22463911409926834, "step": 2272, "train/sim_loss": 0.015625 }, { "epoch": 0.22463911409926834, "step": 2272, "train/total_loss": 0.06106871739029884 }, { "entropy": 9.078593254089355, "epoch": 0.22473798694878386, "mean_token_accuracy": 0.7515225410461426, "num_tokens": 12492608.0, "step": 2273, "train/ce_loss": 1.0920050144195557 }, { "epoch": 0.22473798694878386, "step": 2273, "train/sim_loss": 0.046875 }, { "epoch": 0.22473798694878386, "step": 2273, "train/total_loss": 0.15607550740242004 }, { "entropy": 9.229888916015625, "epoch": 0.2248368597982994, "mean_token_accuracy": 0.7157652378082275, "num_tokens": 12498075.0, "step": 2274, "train/ce_loss": 0.4786331355571747 }, { "epoch": 0.2248368597982994, "step": 2274, "train/sim_loss": 0.06640625 }, { "epoch": 0.2248368597982994, "step": 2274, "train/total_loss": 0.11426956951618195 }, { "entropy": 9.380385398864746, "epoch": 0.2249357326478149, "mean_token_accuracy": 0.7984085083007812, "num_tokens": 12503455.0, "step": 2275, "train/ce_loss": 0.5212197303771973 }, { "epoch": 0.2249357326478149, "step": 2275, "train/sim_loss": 0.03125 }, { "epoch": 0.2249357326478149, "step": 2275, "train/total_loss": 0.08337197452783585 }, { "entropy": 8.928854942321777, "epoch": 0.22503460549733043, "mean_token_accuracy": 0.7399380803108215, "num_tokens": 12509065.0, "step": 2276, "train/ce_loss": 1.4771265983581543 }, { "epoch": 0.22503460549733043, "step": 2276, "train/sim_loss": 0.125 }, { "epoch": 0.22503460549733043, "step": 2276, "train/total_loss": 0.2727126479148865 }, { "entropy": 9.090124130249023, "epoch": 0.22513347834684597, "mean_token_accuracy": 0.7287173867225647, "num_tokens": 12514550.0, "step": 2277, "train/ce_loss": 0.9310904145240784 }, { "epoch": 0.22513347834684597, "step": 2277, "train/sim_loss": 0.0703125 }, { "epoch": 0.22513347834684597, "step": 2277, "train/total_loss": 0.16342154145240784 }, { "entropy": 8.783184051513672, "epoch": 0.22523235119636148, "mean_token_accuracy": 0.7382715940475464, "num_tokens": 12520382.0, "step": 2278, "train/ce_loss": 0.7271852493286133 }, { "epoch": 0.22523235119636148, "step": 2278, "train/sim_loss": 0.1015625 }, { "epoch": 0.22523235119636148, "step": 2278, "train/total_loss": 0.1742810308933258 }, { "entropy": 9.099418640136719, "epoch": 0.225331224045877, "mean_token_accuracy": 0.725882351398468, "num_tokens": 12525837.0, "step": 2279, "train/ce_loss": 0.947310209274292 }, { "epoch": 0.225331224045877, "step": 2279, "train/sim_loss": 0.08203125 }, { "epoch": 0.225331224045877, "step": 2279, "train/total_loss": 0.17676228284835815 }, { "epoch": 0.22543009689539253, "grad_norm": 0.8758128881454468, "learning_rate": 9.439005093210701e-06, "loss": 0.1473, "step": 2280 }, { "entropy": 9.454187393188477, "epoch": 0.22543009689539253, "mean_token_accuracy": 0.7492917776107788, "num_tokens": 12531174.0, "step": 2280, "train/ce_loss": 0.5745047926902771 }, { "epoch": 0.22543009689539253, "step": 2280, "train/sim_loss": 0.0390625 }, { "epoch": 0.22543009689539253, "step": 2280, "train/total_loss": 0.09651298075914383 }, { "entropy": 8.575397491455078, "epoch": 0.22552896974490805, "mean_token_accuracy": 0.7523089647293091, "num_tokens": 12537025.0, "step": 2281, "train/ce_loss": 0.7930688261985779 }, { "epoch": 0.22552896974490805, "step": 2281, "train/sim_loss": 0.125 }, { "epoch": 0.22552896974490805, "step": 2281, "train/total_loss": 0.20430688560009003 }, { "entropy": 9.189189910888672, "epoch": 0.22562784259442356, "mean_token_accuracy": 0.7647058963775635, "num_tokens": 12542431.0, "step": 2282, "train/ce_loss": 0.5909114480018616 }, { "epoch": 0.22562784259442356, "step": 2282, "train/sim_loss": 0.04296875 }, { "epoch": 0.22562784259442356, "step": 2282, "train/total_loss": 0.10205990076065063 }, { "entropy": 9.418121337890625, "epoch": 0.2257267154439391, "mean_token_accuracy": 0.7157622575759888, "num_tokens": 12547818.0, "step": 2283, "train/ce_loss": 0.7697886228561401 }, { "epoch": 0.2257267154439391, "step": 2283, "train/sim_loss": 0.03125 }, { "epoch": 0.2257267154439391, "step": 2283, "train/total_loss": 0.10822886228561401 }, { "entropy": 8.995122909545898, "epoch": 0.22582558829345462, "mean_token_accuracy": 0.7061704397201538, "num_tokens": 12553455.0, "step": 2284, "train/ce_loss": 0.6357759237289429 }, { "epoch": 0.22582558829345462, "step": 2284, "train/sim_loss": 0.02734375 }, { "epoch": 0.22582558829345462, "step": 2284, "train/total_loss": 0.09092134237289429 }, { "entropy": 8.817176818847656, "epoch": 0.22592446114297013, "mean_token_accuracy": 0.7176781296730042, "num_tokens": 12559170.0, "step": 2285, "train/ce_loss": 0.6062124371528625 }, { "epoch": 0.22592446114297013, "step": 2285, "train/sim_loss": 0.02734375 }, { "epoch": 0.22592446114297013, "step": 2285, "train/total_loss": 0.0879649966955185 }, { "entropy": 9.072698593139648, "epoch": 0.22602333399248567, "mean_token_accuracy": 0.7051129341125488, "num_tokens": 12564674.0, "step": 2286, "train/ce_loss": 0.9851348996162415 }, { "epoch": 0.22602333399248567, "step": 2286, "train/sim_loss": 0.0625 }, { "epoch": 0.22602333399248567, "step": 2286, "train/total_loss": 0.16101348400115967 }, { "entropy": 9.275079727172852, "epoch": 0.22612220684200118, "mean_token_accuracy": 0.7016574740409851, "num_tokens": 12570035.0, "step": 2287, "train/ce_loss": 0.7606014013290405 }, { "epoch": 0.22612220684200118, "step": 2287, "train/sim_loss": 0.07421875 }, { "epoch": 0.22612220684200118, "step": 2287, "train/total_loss": 0.15027889609336853 }, { "entropy": 9.301481246948242, "epoch": 0.2262210796915167, "mean_token_accuracy": 0.6852540373802185, "num_tokens": 12575334.0, "step": 2288, "train/ce_loss": 0.8031681776046753 }, { "epoch": 0.2262210796915167, "step": 2288, "train/sim_loss": 0.0703125 }, { "epoch": 0.2262210796915167, "step": 2288, "train/total_loss": 0.15062931180000305 }, { "entropy": 8.809331893920898, "epoch": 0.22631995254103224, "mean_token_accuracy": 0.73113614320755, "num_tokens": 12581081.0, "step": 2289, "train/ce_loss": 0.45368891954421997 }, { "epoch": 0.22631995254103224, "step": 2289, "train/sim_loss": 0.08203125 }, { "epoch": 0.22631995254103224, "step": 2289, "train/total_loss": 0.12740014493465424 }, { "entropy": 9.400045394897461, "epoch": 0.22641882539054775, "mean_token_accuracy": 0.7509340047836304, "num_tokens": 12586417.0, "step": 2290, "train/ce_loss": 0.3098753094673157 }, { "epoch": 0.22641882539054775, "step": 2290, "train/sim_loss": 0.0703125 }, { "epoch": 0.22641882539054775, "step": 2290, "train/total_loss": 0.10130003094673157 }, { "entropy": 8.936885833740234, "epoch": 0.2265176982400633, "mean_token_accuracy": 0.7293519973754883, "num_tokens": 12591839.0, "step": 2291, "train/ce_loss": 1.123040795326233 }, { "epoch": 0.2265176982400633, "step": 2291, "train/sim_loss": 0.078125 }, { "epoch": 0.2265176982400633, "step": 2291, "train/total_loss": 0.19042909145355225 }, { "entropy": 9.242895126342773, "epoch": 0.2266165710895788, "mean_token_accuracy": 0.6427605152130127, "num_tokens": 12597200.0, "step": 2292, "train/ce_loss": 1.3863061666488647 }, { "epoch": 0.2266165710895788, "step": 2292, "train/sim_loss": 0.0703125 }, { "epoch": 0.2266165710895788, "step": 2292, "train/total_loss": 0.20894311368465424 }, { "entropy": 9.469788551330566, "epoch": 0.22671544393909432, "mean_token_accuracy": 0.8050397634506226, "num_tokens": 12602659.0, "step": 2293, "train/ce_loss": 0.6933088898658752 }, { "epoch": 0.22671544393909432, "step": 2293, "train/sim_loss": 0.0625 }, { "epoch": 0.22671544393909432, "step": 2293, "train/total_loss": 0.13183090090751648 }, { "entropy": 8.846986770629883, "epoch": 0.22681431678860986, "mean_token_accuracy": 0.7465388774871826, "num_tokens": 12608235.0, "step": 2294, "train/ce_loss": 0.8286305069923401 }, { "epoch": 0.22681431678860986, "step": 2294, "train/sim_loss": 0.1015625 }, { "epoch": 0.22681431678860986, "step": 2294, "train/total_loss": 0.18442556262016296 }, { "entropy": 9.423431396484375, "epoch": 0.22691318963812537, "mean_token_accuracy": 0.7910447716712952, "num_tokens": 12613533.0, "step": 2295, "train/ce_loss": 0.8010938167572021 }, { "epoch": 0.22691318963812537, "step": 2295, "train/sim_loss": 0.05859375 }, { "epoch": 0.22691318963812537, "step": 2295, "train/total_loss": 0.1387031376361847 }, { "entropy": 8.834715843200684, "epoch": 0.2270120624876409, "mean_token_accuracy": 0.6977687478065491, "num_tokens": 12619106.0, "step": 2296, "train/ce_loss": 0.4932561218738556 }, { "epoch": 0.2270120624876409, "step": 2296, "train/sim_loss": 0.0546875 }, { "epoch": 0.2270120624876409, "step": 2296, "train/total_loss": 0.1040131151676178 }, { "entropy": 9.5335111618042, "epoch": 0.22711093533715643, "mean_token_accuracy": 0.7635829448699951, "num_tokens": 12624348.0, "step": 2297, "train/ce_loss": 0.981454610824585 }, { "epoch": 0.22711093533715643, "step": 2297, "train/sim_loss": 0.0625 }, { "epoch": 0.22711093533715643, "step": 2297, "train/total_loss": 0.16064545512199402 }, { "entropy": 9.352757453918457, "epoch": 0.22720980818667194, "mean_token_accuracy": 0.733668327331543, "num_tokens": 12629907.0, "step": 2298, "train/ce_loss": 1.1511225700378418 }, { "epoch": 0.22720980818667194, "step": 2298, "train/sim_loss": 0.05859375 }, { "epoch": 0.22720980818667194, "step": 2298, "train/total_loss": 0.17370600998401642 }, { "entropy": 9.12146282196045, "epoch": 0.22730868103618745, "mean_token_accuracy": 0.6919739842414856, "num_tokens": 12635494.0, "step": 2299, "train/ce_loss": 1.025706171989441 }, { "epoch": 0.22730868103618745, "step": 2299, "train/sim_loss": 0.0390625 }, { "epoch": 0.22730868103618745, "step": 2299, "train/total_loss": 0.14163312315940857 }, { "epoch": 0.227407553885703, "grad_norm": 1.2076016664505005, "learning_rate": 9.434060228452752e-06, "loss": 0.1605, "step": 2300 }, { "entropy": 9.398931503295898, "epoch": 0.227407553885703, "mean_token_accuracy": 0.7666248679161072, "num_tokens": 12640897.0, "step": 2300, "train/ce_loss": 0.42762455344200134 }, { "epoch": 0.227407553885703, "step": 2300, "train/sim_loss": 0.0234375 }, { "epoch": 0.227407553885703, "step": 2300, "train/total_loss": 0.06619995832443237 }, { "entropy": 9.324003219604492, "epoch": 0.2275064267352185, "mean_token_accuracy": 0.757969319820404, "num_tokens": 12646254.0, "step": 2301, "train/ce_loss": 1.2006574869155884 }, { "epoch": 0.2275064267352185, "step": 2301, "train/sim_loss": 0.10546875 }, { "epoch": 0.2275064267352185, "step": 2301, "train/total_loss": 0.22553449869155884 }, { "entropy": 9.270509719848633, "epoch": 0.22760529958473402, "mean_token_accuracy": 0.7219321131706238, "num_tokens": 12651538.0, "step": 2302, "train/ce_loss": 1.9129657745361328 }, { "epoch": 0.22760529958473402, "step": 2302, "train/sim_loss": 0.07421875 }, { "epoch": 0.22760529958473402, "step": 2302, "train/total_loss": 0.2655153274536133 }, { "entropy": 9.254439353942871, "epoch": 0.22770417243424956, "mean_token_accuracy": 0.7776456475257874, "num_tokens": 12656949.0, "step": 2303, "train/ce_loss": 0.7876937389373779 }, { "epoch": 0.22770417243424956, "step": 2303, "train/sim_loss": 0.0703125 }, { "epoch": 0.22770417243424956, "step": 2303, "train/total_loss": 0.14908188581466675 }, { "entropy": 9.313733100891113, "epoch": 0.22780304528376508, "mean_token_accuracy": 0.75, "num_tokens": 12662318.0, "step": 2304, "train/ce_loss": 1.0245161056518555 }, { "epoch": 0.22780304528376508, "step": 2304, "train/sim_loss": 0.046875 }, { "epoch": 0.22780304528376508, "step": 2304, "train/total_loss": 0.1493266224861145 }, { "entropy": 9.397725105285645, "epoch": 0.2279019181332806, "mean_token_accuracy": 0.7517337203025818, "num_tokens": 12667803.0, "step": 2305, "train/ce_loss": 0.5887020230293274 }, { "epoch": 0.2279019181332806, "step": 2305, "train/sim_loss": 0.06640625 }, { "epoch": 0.2279019181332806, "step": 2305, "train/total_loss": 0.12527644634246826 }, { "entropy": 9.331926345825195, "epoch": 0.22800079098279613, "mean_token_accuracy": 0.7028014659881592, "num_tokens": 12673203.0, "step": 2306, "train/ce_loss": 1.2137951850891113 }, { "epoch": 0.22800079098279613, "step": 2306, "train/sim_loss": 0.0546875 }, { "epoch": 0.22800079098279613, "step": 2306, "train/total_loss": 0.1760670244693756 }, { "entropy": 8.923837661743164, "epoch": 0.22809966383231164, "mean_token_accuracy": 0.765656590461731, "num_tokens": 12678838.0, "step": 2307, "train/ce_loss": 0.6049782037734985 }, { "epoch": 0.22809966383231164, "step": 2307, "train/sim_loss": 0.01953125 }, { "epoch": 0.22809966383231164, "step": 2307, "train/total_loss": 0.08002907037734985 }, { "entropy": 9.168447494506836, "epoch": 0.22819853668182716, "mean_token_accuracy": 0.7530487775802612, "num_tokens": 12684486.0, "step": 2308, "train/ce_loss": 0.6704729199409485 }, { "epoch": 0.22819853668182716, "step": 2308, "train/sim_loss": 0.12109375 }, { "epoch": 0.22819853668182716, "step": 2308, "train/total_loss": 0.18814104795455933 }, { "entropy": 9.234230995178223, "epoch": 0.2282974095313427, "mean_token_accuracy": 0.6821885704994202, "num_tokens": 12689884.0, "step": 2309, "train/ce_loss": 0.5187104344367981 }, { "epoch": 0.2282974095313427, "step": 2309, "train/sim_loss": 0.02734375 }, { "epoch": 0.2282974095313427, "step": 2309, "train/total_loss": 0.07921479642391205 }, { "entropy": 9.33066463470459, "epoch": 0.2283962823808582, "mean_token_accuracy": 0.7295188307762146, "num_tokens": 12695253.0, "step": 2310, "train/ce_loss": 1.1207504272460938 }, { "epoch": 0.2283962823808582, "step": 2310, "train/sim_loss": 0.109375 }, { "epoch": 0.2283962823808582, "step": 2310, "train/total_loss": 0.2214500457048416 }, { "entropy": 8.699488639831543, "epoch": 0.22849515523037375, "mean_token_accuracy": 0.6997269988059998, "num_tokens": 12700923.0, "step": 2311, "train/ce_loss": 1.1485005617141724 }, { "epoch": 0.22849515523037375, "step": 2311, "train/sim_loss": 0.078125 }, { "epoch": 0.22849515523037375, "step": 2311, "train/total_loss": 0.19297505915164948 }, { "entropy": 8.623822212219238, "epoch": 0.22859402807988927, "mean_token_accuracy": 0.7919227480888367, "num_tokens": 12706802.0, "step": 2312, "train/ce_loss": 0.6145561933517456 }, { "epoch": 0.22859402807988927, "step": 2312, "train/sim_loss": 0.10546875 }, { "epoch": 0.22859402807988927, "step": 2312, "train/total_loss": 0.1669243723154068 }, { "entropy": 9.149600982666016, "epoch": 0.22869290092940478, "mean_token_accuracy": 0.7247312068939209, "num_tokens": 12712355.0, "step": 2313, "train/ce_loss": 1.1137343645095825 }, { "epoch": 0.22869290092940478, "step": 2313, "train/sim_loss": 0.0703125 }, { "epoch": 0.22869290092940478, "step": 2313, "train/total_loss": 0.1816859394311905 }, { "entropy": 9.075239181518555, "epoch": 0.22879177377892032, "mean_token_accuracy": 0.7317620515823364, "num_tokens": 12717831.0, "step": 2314, "train/ce_loss": 0.5633830428123474 }, { "epoch": 0.22879177377892032, "step": 2314, "train/sim_loss": 0.09375 }, { "epoch": 0.22879177377892032, "step": 2314, "train/total_loss": 0.15008831024169922 }, { "entropy": 8.852270126342773, "epoch": 0.22889064662843583, "mean_token_accuracy": 0.8066801428794861, "num_tokens": 12723543.0, "step": 2315, "train/ce_loss": 0.5033262372016907 }, { "epoch": 0.22889064662843583, "step": 2315, "train/sim_loss": 0.046875 }, { "epoch": 0.22889064662843583, "step": 2315, "train/total_loss": 0.09720762073993683 }, { "entropy": 9.19036865234375, "epoch": 0.22898951947795135, "mean_token_accuracy": 0.7713004350662231, "num_tokens": 12729089.0, "step": 2316, "train/ce_loss": 1.0367661714553833 }, { "epoch": 0.22898951947795135, "step": 2316, "train/sim_loss": 0.01953125 }, { "epoch": 0.22898951947795135, "step": 2316, "train/total_loss": 0.12320786714553833 }, { "entropy": 9.373205184936523, "epoch": 0.2290883923274669, "mean_token_accuracy": 0.7594278454780579, "num_tokens": 12734445.0, "step": 2317, "train/ce_loss": 1.1535327434539795 }, { "epoch": 0.2290883923274669, "step": 2317, "train/sim_loss": 0.1015625 }, { "epoch": 0.2290883923274669, "step": 2317, "train/total_loss": 0.2169157862663269 }, { "entropy": 9.357789993286133, "epoch": 0.2291872651769824, "mean_token_accuracy": 0.7463414669036865, "num_tokens": 12739903.0, "step": 2318, "train/ce_loss": 0.6449527144432068 }, { "epoch": 0.2291872651769824, "step": 2318, "train/sim_loss": 0.078125 }, { "epoch": 0.2291872651769824, "step": 2318, "train/total_loss": 0.1426202654838562 }, { "entropy": 9.50872802734375, "epoch": 0.22928613802649792, "mean_token_accuracy": 0.7472222447395325, "num_tokens": 12745406.0, "step": 2319, "train/ce_loss": 0.5848165154457092 }, { "epoch": 0.22928613802649792, "step": 2319, "train/sim_loss": 0.07421875 }, { "epoch": 0.22928613802649792, "step": 2319, "train/total_loss": 0.13270039856433868 }, { "epoch": 0.22938501087601346, "grad_norm": 1.160451889038086, "learning_rate": 9.429115363694804e-06, "loss": 0.1495, "step": 2320 }, { "entropy": 8.642803192138672, "epoch": 0.22938501087601346, "mean_token_accuracy": 0.7518518567085266, "num_tokens": 12751170.0, "step": 2320, "train/ce_loss": 0.5819380283355713 }, { "epoch": 0.22938501087601346, "step": 2320, "train/sim_loss": 0.03515625 }, { "epoch": 0.22938501087601346, "step": 2320, "train/total_loss": 0.09335005283355713 }, { "entropy": 9.47571086883545, "epoch": 0.22948388372552897, "mean_token_accuracy": 0.8189415335655212, "num_tokens": 12756436.0, "step": 2321, "train/ce_loss": 0.6250624060630798 }, { "epoch": 0.22948388372552897, "step": 2321, "train/sim_loss": 0.0625 }, { "epoch": 0.22948388372552897, "step": 2321, "train/total_loss": 0.12500624358654022 }, { "entropy": 9.19014835357666, "epoch": 0.22958275657504448, "mean_token_accuracy": 0.7273755669593811, "num_tokens": 12761956.0, "step": 2322, "train/ce_loss": 0.6594789028167725 }, { "epoch": 0.22958275657504448, "step": 2322, "train/sim_loss": 0.05078125 }, { "epoch": 0.22958275657504448, "step": 2322, "train/total_loss": 0.11672914028167725 }, { "entropy": 9.314668655395508, "epoch": 0.22968162942456002, "mean_token_accuracy": 0.7466487884521484, "num_tokens": 12767347.0, "step": 2323, "train/ce_loss": 0.5167045593261719 }, { "epoch": 0.22968162942456002, "step": 2323, "train/sim_loss": 0.0390625 }, { "epoch": 0.22968162942456002, "step": 2323, "train/total_loss": 0.09073296189308167 }, { "entropy": 9.255136489868164, "epoch": 0.22978050227407554, "mean_token_accuracy": 0.7534246444702148, "num_tokens": 12772823.0, "step": 2324, "train/ce_loss": 0.7643610239028931 }, { "epoch": 0.22978050227407554, "step": 2324, "train/sim_loss": 0.03515625 }, { "epoch": 0.22978050227407554, "step": 2324, "train/total_loss": 0.1115923523902893 }, { "entropy": 9.162620544433594, "epoch": 0.22987937512359105, "mean_token_accuracy": 0.7272727489471436, "num_tokens": 12778349.0, "step": 2325, "train/ce_loss": 0.9240086674690247 }, { "epoch": 0.22987937512359105, "step": 2325, "train/sim_loss": 0.046875 }, { "epoch": 0.22987937512359105, "step": 2325, "train/total_loss": 0.13927587866783142 }, { "entropy": 9.109992980957031, "epoch": 0.2299782479731066, "mean_token_accuracy": 0.7777777910232544, "num_tokens": 12783949.0, "step": 2326, "train/ce_loss": 0.5358572602272034 }, { "epoch": 0.2299782479731066, "step": 2326, "train/sim_loss": 0.04296875 }, { "epoch": 0.2299782479731066, "step": 2326, "train/total_loss": 0.0965544730424881 }, { "entropy": 9.167407989501953, "epoch": 0.2300771208226221, "mean_token_accuracy": 0.7378318309783936, "num_tokens": 12789462.0, "step": 2327, "train/ce_loss": 1.6126677989959717 }, { "epoch": 0.2300771208226221, "step": 2327, "train/sim_loss": 0.078125 }, { "epoch": 0.2300771208226221, "step": 2327, "train/total_loss": 0.23939178884029388 }, { "entropy": 9.344123840332031, "epoch": 0.23017599367213762, "mean_token_accuracy": 0.7705570459365845, "num_tokens": 12794796.0, "step": 2328, "train/ce_loss": 1.3802226781845093 }, { "epoch": 0.23017599367213762, "step": 2328, "train/sim_loss": 0.1171875 }, { "epoch": 0.23017599367213762, "step": 2328, "train/total_loss": 0.2552097737789154 }, { "entropy": 9.440181732177734, "epoch": 0.23027486652165316, "mean_token_accuracy": 0.7867177724838257, "num_tokens": 12800181.0, "step": 2329, "train/ce_loss": 0.3156089186668396 }, { "epoch": 0.23027486652165316, "step": 2329, "train/sim_loss": 0.07421875 }, { "epoch": 0.23027486652165316, "step": 2329, "train/total_loss": 0.10577964782714844 }, { "entropy": 9.165396690368652, "epoch": 0.23037373937116867, "mean_token_accuracy": 0.7822120785713196, "num_tokens": 12805672.0, "step": 2330, "train/ce_loss": 0.9759741425514221 }, { "epoch": 0.23037373937116867, "step": 2330, "train/sim_loss": 0.0546875 }, { "epoch": 0.23037373937116867, "step": 2330, "train/total_loss": 0.1522849202156067 }, { "entropy": 9.182999610900879, "epoch": 0.2304726122206842, "mean_token_accuracy": 0.7949367165565491, "num_tokens": 12811067.0, "step": 2331, "train/ce_loss": 0.47858527302742004 }, { "epoch": 0.2304726122206842, "step": 2331, "train/sim_loss": 0.0625 }, { "epoch": 0.2304726122206842, "step": 2331, "train/total_loss": 0.11035852879285812 }, { "entropy": 8.990257263183594, "epoch": 0.23057148507019973, "mean_token_accuracy": 0.726141095161438, "num_tokens": 12816663.0, "step": 2332, "train/ce_loss": 1.2566879987716675 }, { "epoch": 0.23057148507019973, "step": 2332, "train/sim_loss": 0.109375 }, { "epoch": 0.23057148507019973, "step": 2332, "train/total_loss": 0.23504380881786346 }, { "entropy": 9.463262557983398, "epoch": 0.23067035791971524, "mean_token_accuracy": 0.7192053198814392, "num_tokens": 12822002.0, "step": 2333, "train/ce_loss": 0.9948331117630005 }, { "epoch": 0.23067035791971524, "step": 2333, "train/sim_loss": 0.06640625 }, { "epoch": 0.23067035791971524, "step": 2333, "train/total_loss": 0.16588956117630005 }, { "entropy": 9.16469955444336, "epoch": 0.23076923076923078, "mean_token_accuracy": 0.746835470199585, "num_tokens": 12827453.0, "step": 2334, "train/ce_loss": 0.544058620929718 }, { "epoch": 0.23076923076923078, "step": 2334, "train/sim_loss": 0.01953125 }, { "epoch": 0.23076923076923078, "step": 2334, "train/total_loss": 0.07393711805343628 }, { "entropy": 8.891324043273926, "epoch": 0.2308681036187463, "mean_token_accuracy": 0.6991869807243347, "num_tokens": 12833024.0, "step": 2335, "train/ce_loss": 1.024863600730896 }, { "epoch": 0.2308681036187463, "step": 2335, "train/sim_loss": 0.08984375 }, { "epoch": 0.2308681036187463, "step": 2335, "train/total_loss": 0.19233012199401855 }, { "entropy": 9.18754768371582, "epoch": 0.2309669764682618, "mean_token_accuracy": 0.7582159638404846, "num_tokens": 12838510.0, "step": 2336, "train/ce_loss": 0.49076056480407715 }, { "epoch": 0.2309669764682618, "step": 2336, "train/sim_loss": 0.02734375 }, { "epoch": 0.2309669764682618, "step": 2336, "train/total_loss": 0.07641980797052383 }, { "entropy": 8.878618240356445, "epoch": 0.23106584931777735, "mean_token_accuracy": 0.748140275478363, "num_tokens": 12844134.0, "step": 2337, "train/ce_loss": 0.8835176825523376 }, { "epoch": 0.23106584931777735, "step": 2337, "train/sim_loss": 0.0859375 }, { "epoch": 0.23106584931777735, "step": 2337, "train/total_loss": 0.174289271235466 }, { "entropy": 9.275635719299316, "epoch": 0.23116472216729286, "mean_token_accuracy": 0.7485101222991943, "num_tokens": 12849600.0, "step": 2338, "train/ce_loss": 0.7616112232208252 }, { "epoch": 0.23116472216729286, "step": 2338, "train/sim_loss": 0.08203125 }, { "epoch": 0.23116472216729286, "step": 2338, "train/total_loss": 0.15819236636161804 }, { "entropy": 9.463948249816895, "epoch": 0.23126359501680838, "mean_token_accuracy": 0.7503506541252136, "num_tokens": 12854950.0, "step": 2339, "train/ce_loss": 0.6837087273597717 }, { "epoch": 0.23126359501680838, "step": 2339, "train/sim_loss": 0.03125 }, { "epoch": 0.23126359501680838, "step": 2339, "train/total_loss": 0.09962087124586105 }, { "epoch": 0.23136246786632392, "grad_norm": 0.8150753378868103, "learning_rate": 9.424170498936855e-06, "loss": 0.1407, "step": 2340 }, { "entropy": 8.898162841796875, "epoch": 0.23136246786632392, "mean_token_accuracy": 0.7046511769294739, "num_tokens": 12860478.0, "step": 2340, "train/ce_loss": 0.9486772418022156 }, { "epoch": 0.23136246786632392, "step": 2340, "train/sim_loss": 0.08984375 }, { "epoch": 0.23136246786632392, "step": 2340, "train/total_loss": 0.1847114861011505 }, { "entropy": 8.788373947143555, "epoch": 0.23146134071583943, "mean_token_accuracy": 0.8247232437133789, "num_tokens": 12866215.0, "step": 2341, "train/ce_loss": 0.43347957730293274 }, { "epoch": 0.23146134071583943, "step": 2341, "train/sim_loss": 0.11328125 }, { "epoch": 0.23146134071583943, "step": 2341, "train/total_loss": 0.15662920475006104 }, { "entropy": 8.890182495117188, "epoch": 0.23156021356535494, "mean_token_accuracy": 0.7234513163566589, "num_tokens": 12871747.0, "step": 2342, "train/ce_loss": 0.504688024520874 }, { "epoch": 0.23156021356535494, "step": 2342, "train/sim_loss": 0.05859375 }, { "epoch": 0.23156021356535494, "step": 2342, "train/total_loss": 0.1090625524520874 }, { "entropy": 8.810118675231934, "epoch": 0.23165908641487049, "mean_token_accuracy": 0.7386046648025513, "num_tokens": 12877354.0, "step": 2343, "train/ce_loss": 0.601392388343811 }, { "epoch": 0.23165908641487049, "step": 2343, "train/sim_loss": 0.06640625 }, { "epoch": 0.23165908641487049, "step": 2343, "train/total_loss": 0.1265454888343811 }, { "entropy": 9.072904586791992, "epoch": 0.231757959264386, "mean_token_accuracy": 0.7885462641716003, "num_tokens": 12882871.0, "step": 2344, "train/ce_loss": 1.036359190940857 }, { "epoch": 0.231757959264386, "step": 2344, "train/sim_loss": 0.03515625 }, { "epoch": 0.231757959264386, "step": 2344, "train/total_loss": 0.13879217207431793 }, { "entropy": 8.883110046386719, "epoch": 0.2318568321139015, "mean_token_accuracy": 0.7519920468330383, "num_tokens": 12888527.0, "step": 2345, "train/ce_loss": 0.7277324795722961 }, { "epoch": 0.2318568321139015, "step": 2345, "train/sim_loss": 0.0703125 }, { "epoch": 0.2318568321139015, "step": 2345, "train/total_loss": 0.14308574795722961 }, { "entropy": 9.254695892333984, "epoch": 0.23195570496341705, "mean_token_accuracy": 0.7663671374320984, "num_tokens": 12893945.0, "step": 2346, "train/ce_loss": 0.850159227848053 }, { "epoch": 0.23195570496341705, "step": 2346, "train/sim_loss": 0.07421875 }, { "epoch": 0.23195570496341705, "step": 2346, "train/total_loss": 0.1592346727848053 }, { "entropy": 9.169316291809082, "epoch": 0.23205457781293257, "mean_token_accuracy": 0.7187851667404175, "num_tokens": 12899430.0, "step": 2347, "train/ce_loss": 1.3320285081863403 }, { "epoch": 0.23205457781293257, "step": 2347, "train/sim_loss": 0.078125 }, { "epoch": 0.23205457781293257, "step": 2347, "train/total_loss": 0.21132785081863403 }, { "entropy": 9.16633415222168, "epoch": 0.23215345066244808, "mean_token_accuracy": 0.6817073225975037, "num_tokens": 12904904.0, "step": 2348, "train/ce_loss": 0.6820510029792786 }, { "epoch": 0.23215345066244808, "step": 2348, "train/sim_loss": 0.046875 }, { "epoch": 0.23215345066244808, "step": 2348, "train/total_loss": 0.1150801032781601 }, { "entropy": 9.18902587890625, "epoch": 0.23225232351196362, "mean_token_accuracy": 0.722482442855835, "num_tokens": 12910341.0, "step": 2349, "train/ce_loss": 1.646162509918213 }, { "epoch": 0.23225232351196362, "step": 2349, "train/sim_loss": 0.09375 }, { "epoch": 0.23225232351196362, "step": 2349, "train/total_loss": 0.25836625695228577 }, { "entropy": 9.48336410522461, "epoch": 0.23235119636147913, "mean_token_accuracy": 0.7690014839172363, "num_tokens": 12915634.0, "step": 2350, "train/ce_loss": 0.658855140209198 }, { "epoch": 0.23235119636147913, "step": 2350, "train/sim_loss": 0.09375 }, { "epoch": 0.23235119636147913, "step": 2350, "train/total_loss": 0.1596355140209198 }, { "entropy": 8.697244644165039, "epoch": 0.23245006921099465, "mean_token_accuracy": 0.6714046597480774, "num_tokens": 12921456.0, "step": 2351, "train/ce_loss": 0.8832906484603882 }, { "epoch": 0.23245006921099465, "step": 2351, "train/sim_loss": 0.0859375 }, { "epoch": 0.23245006921099465, "step": 2351, "train/total_loss": 0.17426657676696777 }, { "entropy": 9.089341163635254, "epoch": 0.2325489420605102, "mean_token_accuracy": 0.742671012878418, "num_tokens": 12927024.0, "step": 2352, "train/ce_loss": 1.1928786039352417 }, { "epoch": 0.2325489420605102, "step": 2352, "train/sim_loss": 0.05859375 }, { "epoch": 0.2325489420605102, "step": 2352, "train/total_loss": 0.1778816133737564 }, { "entropy": 9.231563568115234, "epoch": 0.2326478149100257, "mean_token_accuracy": 0.7694235444068909, "num_tokens": 12932390.0, "step": 2353, "train/ce_loss": 0.5146740674972534 }, { "epoch": 0.2326478149100257, "step": 2353, "train/sim_loss": 0.0234375 }, { "epoch": 0.2326478149100257, "step": 2353, "train/total_loss": 0.0749049037694931 }, { "entropy": 8.977638244628906, "epoch": 0.23274668775954124, "mean_token_accuracy": 0.7151370644569397, "num_tokens": 12937827.0, "step": 2354, "train/ce_loss": 1.1082602739334106 }, { "epoch": 0.23274668775954124, "step": 2354, "train/sim_loss": 0.12109375 }, { "epoch": 0.23274668775954124, "step": 2354, "train/total_loss": 0.2319197803735733 }, { "entropy": 9.140421867370605, "epoch": 0.23284556060905676, "mean_token_accuracy": 0.7488584518432617, "num_tokens": 12943340.0, "step": 2355, "train/ce_loss": 0.7756370902061462 }, { "epoch": 0.23284556060905676, "step": 2355, "train/sim_loss": 0.078125 }, { "epoch": 0.23284556060905676, "step": 2355, "train/total_loss": 0.15568870306015015 }, { "entropy": 9.187191009521484, "epoch": 0.23294443345857227, "mean_token_accuracy": 0.716152012348175, "num_tokens": 12948787.0, "step": 2356, "train/ce_loss": 0.7480189800262451 }, { "epoch": 0.23294443345857227, "step": 2356, "train/sim_loss": 0.07421875 }, { "epoch": 0.23294443345857227, "step": 2356, "train/total_loss": 0.14902064204216003 }, { "entropy": 9.33686637878418, "epoch": 0.2330433063080878, "mean_token_accuracy": 0.7378767728805542, "num_tokens": 12954214.0, "step": 2357, "train/ce_loss": 0.6189054846763611 }, { "epoch": 0.2330433063080878, "step": 2357, "train/sim_loss": 0.0703125 }, { "epoch": 0.2330433063080878, "step": 2357, "train/total_loss": 0.13220304250717163 }, { "entropy": 9.363164901733398, "epoch": 0.23314217915760332, "mean_token_accuracy": 0.7797833681106567, "num_tokens": 12959590.0, "step": 2358, "train/ce_loss": 0.6814625263214111 }, { "epoch": 0.23314217915760332, "step": 2358, "train/sim_loss": 0.06640625 }, { "epoch": 0.23314217915760332, "step": 2358, "train/total_loss": 0.1345525085926056 }, { "entropy": 9.174872398376465, "epoch": 0.23324105200711884, "mean_token_accuracy": 0.7631579041481018, "num_tokens": 12965055.0, "step": 2359, "train/ce_loss": 0.9555423259735107 }, { "epoch": 0.23324105200711884, "step": 2359, "train/sim_loss": 0.0703125 }, { "epoch": 0.23324105200711884, "step": 2359, "train/total_loss": 0.16586673259735107 }, { "epoch": 0.23333992485663438, "grad_norm": 0.9501524567604065, "learning_rate": 9.419225634178905e-06, "loss": 0.1565, "step": 2360 }, { "entropy": 9.207107543945312, "epoch": 0.23333992485663438, "mean_token_accuracy": 0.7594786882400513, "num_tokens": 12970464.0, "step": 2360, "train/ce_loss": 0.68275386095047 }, { "epoch": 0.23333992485663438, "step": 2360, "train/sim_loss": 0.0859375 }, { "epoch": 0.23333992485663438, "step": 2360, "train/total_loss": 0.15421289205551147 }, { "entropy": 9.06071662902832, "epoch": 0.2334387977061499, "mean_token_accuracy": 0.7336621284484863, "num_tokens": 12975851.0, "step": 2361, "train/ce_loss": 0.9782293438911438 }, { "epoch": 0.2334387977061499, "step": 2361, "train/sim_loss": 0.08984375 }, { "epoch": 0.2334387977061499, "step": 2361, "train/total_loss": 0.18766668438911438 }, { "entropy": 9.12070083618164, "epoch": 0.2335376705556654, "mean_token_accuracy": 0.7798672318458557, "num_tokens": 12981320.0, "step": 2362, "train/ce_loss": 0.6512324213981628 }, { "epoch": 0.2335376705556654, "step": 2362, "train/sim_loss": 0.015625 }, { "epoch": 0.2335376705556654, "step": 2362, "train/total_loss": 0.08074824512004852 }, { "entropy": 8.946319580078125, "epoch": 0.23363654340518095, "mean_token_accuracy": 0.7543323040008545, "num_tokens": 12986973.0, "step": 2363, "train/ce_loss": 0.5720033645629883 }, { "epoch": 0.23363654340518095, "step": 2363, "train/sim_loss": 0.0390625 }, { "epoch": 0.23363654340518095, "step": 2363, "train/total_loss": 0.0962628424167633 }, { "entropy": 8.840360641479492, "epoch": 0.23373541625469646, "mean_token_accuracy": 0.7388836145401001, "num_tokens": 12992604.0, "step": 2364, "train/ce_loss": 0.7574113011360168 }, { "epoch": 0.23373541625469646, "step": 2364, "train/sim_loss": 0.06640625 }, { "epoch": 0.23373541625469646, "step": 2364, "train/total_loss": 0.14214739203453064 }, { "entropy": 9.075100898742676, "epoch": 0.23383428910421197, "mean_token_accuracy": 0.7235420942306519, "num_tokens": 12998130.0, "step": 2365, "train/ce_loss": 0.6686753630638123 }, { "epoch": 0.23383428910421197, "step": 2365, "train/sim_loss": 0.03125 }, { "epoch": 0.23383428910421197, "step": 2365, "train/total_loss": 0.09811753779649734 }, { "entropy": 9.169670104980469, "epoch": 0.23393316195372751, "mean_token_accuracy": 0.7694038152694702, "num_tokens": 13003697.0, "step": 2366, "train/ce_loss": 0.872016191482544 }, { "epoch": 0.23393316195372751, "step": 2366, "train/sim_loss": 0.078125 }, { "epoch": 0.23393316195372751, "step": 2366, "train/total_loss": 0.16532662510871887 }, { "entropy": 8.90173053741455, "epoch": 0.23403203480324303, "mean_token_accuracy": 0.7220279574394226, "num_tokens": 13009437.0, "step": 2367, "train/ce_loss": 0.4223390519618988 }, { "epoch": 0.23403203480324303, "step": 2367, "train/sim_loss": 0.01953125 }, { "epoch": 0.23403203480324303, "step": 2367, "train/total_loss": 0.061765156686306 }, { "entropy": 8.981478691101074, "epoch": 0.23413090765275854, "mean_token_accuracy": 0.7014134526252747, "num_tokens": 13015168.0, "step": 2368, "train/ce_loss": 0.5289191007614136 }, { "epoch": 0.23413090765275854, "step": 2368, "train/sim_loss": 0.0625 }, { "epoch": 0.23413090765275854, "step": 2368, "train/total_loss": 0.11539191007614136 }, { "entropy": 8.837576866149902, "epoch": 0.23422978050227408, "mean_token_accuracy": 0.7599278092384338, "num_tokens": 13020911.0, "step": 2369, "train/ce_loss": 0.7737554311752319 }, { "epoch": 0.23422978050227408, "step": 2369, "train/sim_loss": 0.0234375 }, { "epoch": 0.23422978050227408, "step": 2369, "train/total_loss": 0.10081304609775543 }, { "entropy": 9.451139450073242, "epoch": 0.2343286533517896, "mean_token_accuracy": 0.7300131320953369, "num_tokens": 13026243.0, "step": 2370, "train/ce_loss": 1.3184535503387451 }, { "epoch": 0.2343286533517896, "step": 2370, "train/sim_loss": 0.05078125 }, { "epoch": 0.2343286533517896, "step": 2370, "train/total_loss": 0.1826266050338745 }, { "entropy": 9.20405387878418, "epoch": 0.2344275262013051, "mean_token_accuracy": 0.698630154132843, "num_tokens": 13031818.0, "step": 2371, "train/ce_loss": 0.6803600192070007 }, { "epoch": 0.2344275262013051, "step": 2371, "train/sim_loss": 0.1484375 }, { "epoch": 0.2344275262013051, "step": 2371, "train/total_loss": 0.2164735049009323 }, { "entropy": 9.336524963378906, "epoch": 0.23452639905082065, "mean_token_accuracy": 0.7577388882637024, "num_tokens": 13037166.0, "step": 2372, "train/ce_loss": 0.47740182280540466 }, { "epoch": 0.23452639905082065, "step": 2372, "train/sim_loss": 0.125 }, { "epoch": 0.23452639905082065, "step": 2372, "train/total_loss": 0.17274019122123718 }, { "entropy": 9.330634117126465, "epoch": 0.23462527190033616, "mean_token_accuracy": 0.7799510955810547, "num_tokens": 13042557.0, "step": 2373, "train/ce_loss": 0.5779379606246948 }, { "epoch": 0.23462527190033616, "step": 2373, "train/sim_loss": 0.06640625 }, { "epoch": 0.23462527190033616, "step": 2373, "train/total_loss": 0.12420004606246948 }, { "entropy": 9.30705738067627, "epoch": 0.2347241447498517, "mean_token_accuracy": 0.766707181930542, "num_tokens": 13048039.0, "step": 2374, "train/ce_loss": 1.1184642314910889 }, { "epoch": 0.2347241447498517, "step": 2374, "train/sim_loss": 0.078125 }, { "epoch": 0.2347241447498517, "step": 2374, "train/total_loss": 0.1899714171886444 }, { "entropy": 9.204816818237305, "epoch": 0.23482301759936722, "mean_token_accuracy": 0.7089552283287048, "num_tokens": 13053509.0, "step": 2375, "train/ce_loss": 0.9980000257492065 }, { "epoch": 0.23482301759936722, "step": 2375, "train/sim_loss": 0.125 }, { "epoch": 0.23482301759936722, "step": 2375, "train/total_loss": 0.2248000055551529 }, { "entropy": 9.260689735412598, "epoch": 0.23492189044888273, "mean_token_accuracy": 0.7622548937797546, "num_tokens": 13058914.0, "step": 2376, "train/ce_loss": 0.9607309103012085 }, { "epoch": 0.23492189044888273, "step": 2376, "train/sim_loss": 0.03125 }, { "epoch": 0.23492189044888273, "step": 2376, "train/total_loss": 0.12732309103012085 }, { "entropy": 9.128045082092285, "epoch": 0.23502076329839827, "mean_token_accuracy": 0.7975903749465942, "num_tokens": 13064419.0, "step": 2377, "train/ce_loss": 0.3408890664577484 }, { "epoch": 0.23502076329839827, "step": 2377, "train/sim_loss": 0.0703125 }, { "epoch": 0.23502076329839827, "step": 2377, "train/total_loss": 0.10440140962600708 }, { "entropy": 9.041582107543945, "epoch": 0.23511963614791379, "mean_token_accuracy": 0.7599999904632568, "num_tokens": 13069940.0, "step": 2378, "train/ce_loss": 0.8313864469528198 }, { "epoch": 0.23511963614791379, "step": 2378, "train/sim_loss": 0.078125 }, { "epoch": 0.23511963614791379, "step": 2378, "train/total_loss": 0.16126364469528198 }, { "entropy": 9.028491973876953, "epoch": 0.2352185089974293, "mean_token_accuracy": 0.7293844223022461, "num_tokens": 13075433.0, "step": 2379, "train/ce_loss": 0.730046272277832 }, { "epoch": 0.2352185089974293, "step": 2379, "train/sim_loss": 0.03125 }, { "epoch": 0.2352185089974293, "step": 2379, "train/total_loss": 0.10425462573766708 }, { "epoch": 0.23531738184694484, "grad_norm": 0.8579027652740479, "learning_rate": 9.414280769420957e-06, "loss": 0.1492, "step": 2380 }, { "entropy": 9.39757251739502, "epoch": 0.23531738184694484, "mean_token_accuracy": 0.7540372610092163, "num_tokens": 13080833.0, "step": 2380, "train/ce_loss": 1.0110691785812378 }, { "epoch": 0.23531738184694484, "step": 2380, "train/sim_loss": 0.09375 }, { "epoch": 0.23531738184694484, "step": 2380, "train/total_loss": 0.1948569118976593 }, { "entropy": 9.199833869934082, "epoch": 0.23541625469646035, "mean_token_accuracy": 0.6831072568893433, "num_tokens": 13086475.0, "step": 2381, "train/ce_loss": 0.5288617014884949 }, { "epoch": 0.23541625469646035, "step": 2381, "train/sim_loss": 0.12109375 }, { "epoch": 0.23541625469646035, "step": 2381, "train/total_loss": 0.17397992312908173 }, { "entropy": 9.315118789672852, "epoch": 0.23551512754597587, "mean_token_accuracy": 0.7287933230400085, "num_tokens": 13091964.0, "step": 2382, "train/ce_loss": 0.8100540041923523 }, { "epoch": 0.23551512754597587, "step": 2382, "train/sim_loss": 0.1015625 }, { "epoch": 0.23551512754597587, "step": 2382, "train/total_loss": 0.18256789445877075 }, { "entropy": 9.345630645751953, "epoch": 0.2356140003954914, "mean_token_accuracy": 0.7071524858474731, "num_tokens": 13097383.0, "step": 2383, "train/ce_loss": 1.0763593912124634 }, { "epoch": 0.2356140003954914, "step": 2383, "train/sim_loss": 0.09375 }, { "epoch": 0.2356140003954914, "step": 2383, "train/total_loss": 0.20138594508171082 }, { "entropy": 9.315841674804688, "epoch": 0.23571287324500692, "mean_token_accuracy": 0.7963636517524719, "num_tokens": 13102810.0, "step": 2384, "train/ce_loss": 0.567698061466217 }, { "epoch": 0.23571287324500692, "step": 2384, "train/sim_loss": 0.0703125 }, { "epoch": 0.23571287324500692, "step": 2384, "train/total_loss": 0.12708230316638947 }, { "entropy": 8.84469985961914, "epoch": 0.23581174609452243, "mean_token_accuracy": 0.7653688788414001, "num_tokens": 13108432.0, "step": 2385, "train/ce_loss": 1.3935414552688599 }, { "epoch": 0.23581174609452243, "step": 2385, "train/sim_loss": 0.1015625 }, { "epoch": 0.23581174609452243, "step": 2385, "train/total_loss": 0.2409166544675827 }, { "entropy": 9.456504821777344, "epoch": 0.23591061894403798, "mean_token_accuracy": 0.7085714340209961, "num_tokens": 13113772.0, "step": 2386, "train/ce_loss": 0.6088796854019165 }, { "epoch": 0.23591061894403798, "step": 2386, "train/sim_loss": 0.0859375 }, { "epoch": 0.23591061894403798, "step": 2386, "train/total_loss": 0.14682546257972717 }, { "entropy": 8.890081405639648, "epoch": 0.2360094917935535, "mean_token_accuracy": 0.7119244337081909, "num_tokens": 13119210.0, "step": 2387, "train/ce_loss": 0.7644118666648865 }, { "epoch": 0.2360094917935535, "step": 2387, "train/sim_loss": 0.0546875 }, { "epoch": 0.2360094917935535, "step": 2387, "train/total_loss": 0.1311286985874176 }, { "entropy": 9.196063995361328, "epoch": 0.236108364643069, "mean_token_accuracy": 0.7289719581604004, "num_tokens": 13124691.0, "step": 2388, "train/ce_loss": 0.713472306728363 }, { "epoch": 0.236108364643069, "step": 2388, "train/sim_loss": 0.07421875 }, { "epoch": 0.236108364643069, "step": 2388, "train/total_loss": 0.14556598663330078 }, { "entropy": 9.123497009277344, "epoch": 0.23620723749258454, "mean_token_accuracy": 0.7217165231704712, "num_tokens": 13130144.0, "step": 2389, "train/ce_loss": 0.7694689631462097 }, { "epoch": 0.23620723749258454, "step": 2389, "train/sim_loss": 0.06640625 }, { "epoch": 0.23620723749258454, "step": 2389, "train/total_loss": 0.1433531492948532 }, { "entropy": 8.550424575805664, "epoch": 0.23630611034210006, "mean_token_accuracy": 0.7449799180030823, "num_tokens": 13135867.0, "step": 2390, "train/ce_loss": 0.8268230557441711 }, { "epoch": 0.23630611034210006, "step": 2390, "train/sim_loss": 0.1171875 }, { "epoch": 0.23630611034210006, "step": 2390, "train/total_loss": 0.1998698115348816 }, { "entropy": 9.46438217163086, "epoch": 0.23640498319161557, "mean_token_accuracy": 0.7844941020011902, "num_tokens": 13141258.0, "step": 2391, "train/ce_loss": 0.7515913844108582 }, { "epoch": 0.23640498319161557, "step": 2391, "train/sim_loss": 0.03125 }, { "epoch": 0.23640498319161557, "step": 2391, "train/total_loss": 0.10640913993120193 }, { "entropy": 9.0453462600708, "epoch": 0.2365038560411311, "mean_token_accuracy": 0.7363530993461609, "num_tokens": 13146765.0, "step": 2392, "train/ce_loss": 1.0351674556732178 }, { "epoch": 0.2365038560411311, "step": 2392, "train/sim_loss": 0.13671875 }, { "epoch": 0.2365038560411311, "step": 2392, "train/total_loss": 0.24023550748825073 }, { "entropy": 8.808025360107422, "epoch": 0.23660272889064662, "mean_token_accuracy": 0.7428315281867981, "num_tokens": 13152445.0, "step": 2393, "train/ce_loss": 0.5450968742370605 }, { "epoch": 0.23660272889064662, "step": 2393, "train/sim_loss": 0.05078125 }, { "epoch": 0.23660272889064662, "step": 2393, "train/total_loss": 0.10529093444347382 }, { "entropy": 9.007831573486328, "epoch": 0.23670160174016217, "mean_token_accuracy": 0.7061403393745422, "num_tokens": 13158007.0, "step": 2394, "train/ce_loss": 1.3234292268753052 }, { "epoch": 0.23670160174016217, "step": 2394, "train/sim_loss": 0.06640625 }, { "epoch": 0.23670160174016217, "step": 2394, "train/total_loss": 0.19874916970729828 }, { "entropy": 8.400471687316895, "epoch": 0.23680047458967768, "mean_token_accuracy": 0.740469217300415, "num_tokens": 13163999.0, "step": 2395, "train/ce_loss": 1.731566071510315 }, { "epoch": 0.23680047458967768, "step": 2395, "train/sim_loss": 0.0546875 }, { "epoch": 0.23680047458967768, "step": 2395, "train/total_loss": 0.22784410417079926 }, { "entropy": 9.167736053466797, "epoch": 0.2368993474391932, "mean_token_accuracy": 0.7456575632095337, "num_tokens": 13169412.0, "step": 2396, "train/ce_loss": 1.234921932220459 }, { "epoch": 0.2368993474391932, "step": 2396, "train/sim_loss": 0.08203125 }, { "epoch": 0.2368993474391932, "step": 2396, "train/total_loss": 0.20552344620227814 }, { "entropy": 9.339987754821777, "epoch": 0.23699822028870873, "mean_token_accuracy": 0.7397820353507996, "num_tokens": 13174770.0, "step": 2397, "train/ce_loss": 0.984082043170929 }, { "epoch": 0.23699822028870873, "step": 2397, "train/sim_loss": 0.04296875 }, { "epoch": 0.23699822028870873, "step": 2397, "train/total_loss": 0.14137695729732513 }, { "entropy": 9.247007369995117, "epoch": 0.23709709313822425, "mean_token_accuracy": 0.7450980544090271, "num_tokens": 13180256.0, "step": 2398, "train/ce_loss": 0.8422147035598755 }, { "epoch": 0.23709709313822425, "step": 2398, "train/sim_loss": 0.05859375 }, { "epoch": 0.23709709313822425, "step": 2398, "train/total_loss": 0.1428152322769165 }, { "entropy": 9.47435188293457, "epoch": 0.23719596598773976, "mean_token_accuracy": 0.7855113744735718, "num_tokens": 13185520.0, "step": 2399, "train/ce_loss": 0.5421736240386963 }, { "epoch": 0.23719596598773976, "step": 2399, "train/sim_loss": 0.07421875 }, { "epoch": 0.23719596598773976, "step": 2399, "train/total_loss": 0.1284361183643341 }, { "epoch": 0.2372948388372553, "grad_norm": 0.8350571990013123, "learning_rate": 9.409335904663008e-06, "loss": 0.1582, "step": 2400 }, { "entropy": 9.233587265014648, "epoch": 0.2372948388372553, "mean_token_accuracy": 0.7676056623458862, "num_tokens": 13190999.0, "step": 2400, "train/ce_loss": 0.35916566848754883 }, { "epoch": 0.2372948388372553, "step": 2400, "train/sim_loss": 0.03125 }, { "epoch": 0.2372948388372553, "step": 2400, "train/total_loss": 0.06716656684875488 }, { "entropy": 9.317353248596191, "epoch": 0.23739371168677081, "mean_token_accuracy": 0.7382199168205261, "num_tokens": 13196421.0, "step": 2401, "train/ce_loss": 0.6853013634681702 }, { "epoch": 0.23739371168677081, "step": 2401, "train/sim_loss": 0.04296875 }, { "epoch": 0.23739371168677081, "step": 2401, "train/total_loss": 0.1114988848567009 }, { "entropy": 8.825057029724121, "epoch": 0.23749258453628633, "mean_token_accuracy": 0.7572916746139526, "num_tokens": 13202061.0, "step": 2402, "train/ce_loss": 0.6657012701034546 }, { "epoch": 0.23749258453628633, "step": 2402, "train/sim_loss": 0.0859375 }, { "epoch": 0.23749258453628633, "step": 2402, "train/total_loss": 0.15250763297080994 }, { "entropy": 9.14011001586914, "epoch": 0.23759145738580187, "mean_token_accuracy": 0.7460136413574219, "num_tokens": 13207524.0, "step": 2403, "train/ce_loss": 0.8858813643455505 }, { "epoch": 0.23759145738580187, "step": 2403, "train/sim_loss": 0.0859375 }, { "epoch": 0.23759145738580187, "step": 2403, "train/total_loss": 0.174525648355484 }, { "entropy": 9.045696258544922, "epoch": 0.23769033023531738, "mean_token_accuracy": 0.7567567825317383, "num_tokens": 13213111.0, "step": 2404, "train/ce_loss": 0.886686384677887 }, { "epoch": 0.23769033023531738, "step": 2404, "train/sim_loss": 0.1171875 }, { "epoch": 0.23769033023531738, "step": 2404, "train/total_loss": 0.20585614442825317 }, { "entropy": 9.32132625579834, "epoch": 0.2377892030848329, "mean_token_accuracy": 0.7976190447807312, "num_tokens": 13218578.0, "step": 2405, "train/ce_loss": 0.34901610016822815 }, { "epoch": 0.2377892030848329, "step": 2405, "train/sim_loss": 0.1171875 }, { "epoch": 0.2377892030848329, "step": 2405, "train/total_loss": 0.15208911895751953 }, { "entropy": 8.991390228271484, "epoch": 0.23788807593434844, "mean_token_accuracy": 0.7378318309783936, "num_tokens": 13223986.0, "step": 2406, "train/ce_loss": 0.8726102709770203 }, { "epoch": 0.23788807593434844, "step": 2406, "train/sim_loss": 0.0703125 }, { "epoch": 0.23788807593434844, "step": 2406, "train/total_loss": 0.15757352113723755 }, { "entropy": 9.039144515991211, "epoch": 0.23798694878386395, "mean_token_accuracy": 0.7084673047065735, "num_tokens": 13229554.0, "step": 2407, "train/ce_loss": 0.8067007064819336 }, { "epoch": 0.23798694878386395, "step": 2407, "train/sim_loss": 0.03515625 }, { "epoch": 0.23798694878386395, "step": 2407, "train/total_loss": 0.1158263236284256 }, { "entropy": 9.522336959838867, "epoch": 0.23808582163337946, "mean_token_accuracy": 0.7223065495491028, "num_tokens": 13234824.0, "step": 2408, "train/ce_loss": 0.9403669834136963 }, { "epoch": 0.23808582163337946, "step": 2408, "train/sim_loss": 0.1015625 }, { "epoch": 0.23808582163337946, "step": 2408, "train/total_loss": 0.19559919834136963 }, { "entropy": 9.328763008117676, "epoch": 0.238184694482895, "mean_token_accuracy": 0.754054069519043, "num_tokens": 13240117.0, "step": 2409, "train/ce_loss": 0.9253959655761719 }, { "epoch": 0.238184694482895, "step": 2409, "train/sim_loss": 0.0625 }, { "epoch": 0.238184694482895, "step": 2409, "train/total_loss": 0.15503960847854614 }, { "entropy": 9.237675666809082, "epoch": 0.23828356733241052, "mean_token_accuracy": 0.7569974660873413, "num_tokens": 13245487.0, "step": 2410, "train/ce_loss": 1.0653560161590576 }, { "epoch": 0.23828356733241052, "step": 2410, "train/sim_loss": 0.0390625 }, { "epoch": 0.23828356733241052, "step": 2410, "train/total_loss": 0.14559811353683472 }, { "entropy": 9.215715408325195, "epoch": 0.23838244018192603, "mean_token_accuracy": 0.7241379022598267, "num_tokens": 13250908.0, "step": 2411, "train/ce_loss": 0.9256007075309753 }, { "epoch": 0.23838244018192603, "step": 2411, "train/sim_loss": 0.078125 }, { "epoch": 0.23838244018192603, "step": 2411, "train/total_loss": 0.1706850826740265 }, { "entropy": 9.284351348876953, "epoch": 0.23848131303144157, "mean_token_accuracy": 0.714640200138092, "num_tokens": 13256272.0, "step": 2412, "train/ce_loss": 0.4408405125141144 }, { "epoch": 0.23848131303144157, "step": 2412, "train/sim_loss": 0.0625 }, { "epoch": 0.23848131303144157, "step": 2412, "train/total_loss": 0.10658405721187592 }, { "entropy": 9.416218757629395, "epoch": 0.23858018588095709, "mean_token_accuracy": 0.750617265701294, "num_tokens": 13261574.0, "step": 2413, "train/ce_loss": 0.7991043925285339 }, { "epoch": 0.23858018588095709, "step": 2413, "train/sim_loss": 0.08203125 }, { "epoch": 0.23858018588095709, "step": 2413, "train/total_loss": 0.16194169223308563 }, { "entropy": 9.343835830688477, "epoch": 0.2386790587304726, "mean_token_accuracy": 0.7517337203025818, "num_tokens": 13266957.0, "step": 2414, "train/ce_loss": 0.7459036111831665 }, { "epoch": 0.2386790587304726, "step": 2414, "train/sim_loss": 0.1171875 }, { "epoch": 0.2386790587304726, "step": 2414, "train/total_loss": 0.19177785515785217 }, { "entropy": 9.14630126953125, "epoch": 0.23877793157998814, "mean_token_accuracy": 0.7471839785575867, "num_tokens": 13272426.0, "step": 2415, "train/ce_loss": 0.4680548906326294 }, { "epoch": 0.23877793157998814, "step": 2415, "train/sim_loss": 0.06640625 }, { "epoch": 0.23877793157998814, "step": 2415, "train/total_loss": 0.1132117360830307 }, { "entropy": 8.972258567810059, "epoch": 0.23887680442950365, "mean_token_accuracy": 0.7316602468490601, "num_tokens": 13278041.0, "step": 2416, "train/ce_loss": 0.7407976984977722 }, { "epoch": 0.23887680442950365, "step": 2416, "train/sim_loss": 0.046875 }, { "epoch": 0.23887680442950365, "step": 2416, "train/total_loss": 0.12095477432012558 }, { "entropy": 9.346928596496582, "epoch": 0.2389756772790192, "mean_token_accuracy": 0.7288828492164612, "num_tokens": 13283538.0, "step": 2417, "train/ce_loss": 0.7531821727752686 }, { "epoch": 0.2389756772790192, "step": 2417, "train/sim_loss": 0.078125 }, { "epoch": 0.2389756772790192, "step": 2417, "train/total_loss": 0.15344321727752686 }, { "entropy": 8.993437767028809, "epoch": 0.2390745501285347, "mean_token_accuracy": 0.7387669682502747, "num_tokens": 13289126.0, "step": 2418, "train/ce_loss": 0.7935821413993835 }, { "epoch": 0.2390745501285347, "step": 2418, "train/sim_loss": 0.08203125 }, { "epoch": 0.2390745501285347, "step": 2418, "train/total_loss": 0.16138947010040283 }, { "entropy": 9.112070083618164, "epoch": 0.23917342297805022, "mean_token_accuracy": 0.7076166868209839, "num_tokens": 13294583.0, "step": 2419, "train/ce_loss": 0.7170619964599609 }, { "epoch": 0.23917342297805022, "step": 2419, "train/sim_loss": 0.0859375 }, { "epoch": 0.23917342297805022, "step": 2419, "train/total_loss": 0.15764370560646057 }, { "epoch": 0.23927229582756576, "grad_norm": 1.0738686323165894, "learning_rate": 9.40439103990506e-06, "loss": 0.149, "step": 2420 }, { "entropy": 9.295671463012695, "epoch": 0.23927229582756576, "mean_token_accuracy": 0.7611749768257141, "num_tokens": 13299946.0, "step": 2420, "train/ce_loss": 0.8244406580924988 }, { "epoch": 0.23927229582756576, "step": 2420, "train/sim_loss": 0.06640625 }, { "epoch": 0.23927229582756576, "step": 2420, "train/total_loss": 0.14885032176971436 }, { "entropy": 9.007556915283203, "epoch": 0.23937116867708128, "mean_token_accuracy": 0.7912946343421936, "num_tokens": 13305482.0, "step": 2421, "train/ce_loss": 0.743137776851654 }, { "epoch": 0.23937116867708128, "step": 2421, "train/sim_loss": 0.03515625 }, { "epoch": 0.23937116867708128, "step": 2421, "train/total_loss": 0.10947003215551376 }, { "entropy": 9.095251083374023, "epoch": 0.2394700415265968, "mean_token_accuracy": 0.6739130616188049, "num_tokens": 13310970.0, "step": 2422, "train/ce_loss": 0.49347683787345886 }, { "epoch": 0.2394700415265968, "step": 2422, "train/sim_loss": 0.0859375 }, { "epoch": 0.2394700415265968, "step": 2422, "train/total_loss": 0.1352851837873459 }, { "entropy": 9.084257125854492, "epoch": 0.23956891437611233, "mean_token_accuracy": 0.7297297120094299, "num_tokens": 13316371.0, "step": 2423, "train/ce_loss": 0.7282159924507141 }, { "epoch": 0.23956891437611233, "step": 2423, "train/sim_loss": 0.0625 }, { "epoch": 0.23956891437611233, "step": 2423, "train/total_loss": 0.13532160222530365 }, { "entropy": 9.336811065673828, "epoch": 0.23966778722562784, "mean_token_accuracy": 0.8125, "num_tokens": 13321684.0, "step": 2424, "train/ce_loss": 0.5357924103736877 }, { "epoch": 0.23966778722562784, "step": 2424, "train/sim_loss": 0.0234375 }, { "epoch": 0.23966778722562784, "step": 2424, "train/total_loss": 0.07701674103736877 }, { "entropy": 8.908781051635742, "epoch": 0.23976666007514336, "mean_token_accuracy": 0.8024691343307495, "num_tokens": 13326976.0, "step": 2425, "train/ce_loss": 0.46100014448165894 }, { "epoch": 0.23976666007514336, "step": 2425, "train/sim_loss": 0.0546875 }, { "epoch": 0.23976666007514336, "step": 2425, "train/total_loss": 0.10078752040863037 }, { "entropy": 9.271570205688477, "epoch": 0.2398655329246589, "mean_token_accuracy": 0.7516688704490662, "num_tokens": 13332369.0, "step": 2426, "train/ce_loss": 0.9985665082931519 }, { "epoch": 0.2398655329246589, "step": 2426, "train/sim_loss": 0.11328125 }, { "epoch": 0.2398655329246589, "step": 2426, "train/total_loss": 0.2131378948688507 }, { "entropy": 8.625679016113281, "epoch": 0.2399644057741744, "mean_token_accuracy": 0.7820512652397156, "num_tokens": 13338184.0, "step": 2427, "train/ce_loss": 0.446796715259552 }, { "epoch": 0.2399644057741744, "step": 2427, "train/sim_loss": 0.09375 }, { "epoch": 0.2399644057741744, "step": 2427, "train/total_loss": 0.1384296715259552 }, { "entropy": 8.797993659973145, "epoch": 0.24006327862368992, "mean_token_accuracy": 0.7446808218955994, "num_tokens": 13343652.0, "step": 2428, "train/ce_loss": 0.602220356464386 }, { "epoch": 0.24006327862368992, "step": 2428, "train/sim_loss": 0.05078125 }, { "epoch": 0.24006327862368992, "step": 2428, "train/total_loss": 0.11100328713655472 }, { "entropy": 8.845714569091797, "epoch": 0.24016215147320547, "mean_token_accuracy": 0.6728060841560364, "num_tokens": 13349145.0, "step": 2429, "train/ce_loss": 0.7394577860832214 }, { "epoch": 0.24016215147320547, "step": 2429, "train/sim_loss": 0.1015625 }, { "epoch": 0.24016215147320547, "step": 2429, "train/total_loss": 0.1755082905292511 }, { "entropy": 9.047611236572266, "epoch": 0.24026102432272098, "mean_token_accuracy": 0.6883628964424133, "num_tokens": 13354740.0, "step": 2430, "train/ce_loss": 0.7139731049537659 }, { "epoch": 0.24026102432272098, "step": 2430, "train/sim_loss": 0.08984375 }, { "epoch": 0.24026102432272098, "step": 2430, "train/total_loss": 0.1612410545349121 }, { "entropy": 9.371095657348633, "epoch": 0.2403598971722365, "mean_token_accuracy": 0.7284482717514038, "num_tokens": 13360098.0, "step": 2431, "train/ce_loss": 0.8484253883361816 }, { "epoch": 0.2403598971722365, "step": 2431, "train/sim_loss": 0.06640625 }, { "epoch": 0.2403598971722365, "step": 2431, "train/total_loss": 0.1512487828731537 }, { "entropy": 9.153241157531738, "epoch": 0.24045877002175203, "mean_token_accuracy": 0.7134292721748352, "num_tokens": 13365484.0, "step": 2432, "train/ce_loss": 0.6817359328269958 }, { "epoch": 0.24045877002175203, "step": 2432, "train/sim_loss": 0.0625 }, { "epoch": 0.24045877002175203, "step": 2432, "train/total_loss": 0.1306735873222351 }, { "entropy": 9.128822326660156, "epoch": 0.24055764287126755, "mean_token_accuracy": 0.7139534950256348, "num_tokens": 13370984.0, "step": 2433, "train/ce_loss": 0.9649145603179932 }, { "epoch": 0.24055764287126755, "step": 2433, "train/sim_loss": 0.0546875 }, { "epoch": 0.24055764287126755, "step": 2433, "train/total_loss": 0.15117895603179932 }, { "entropy": 9.136333465576172, "epoch": 0.24065651572078306, "mean_token_accuracy": 0.730215847492218, "num_tokens": 13376368.0, "step": 2434, "train/ce_loss": 1.4166347980499268 }, { "epoch": 0.24065651572078306, "step": 2434, "train/sim_loss": 0.05859375 }, { "epoch": 0.24065651572078306, "step": 2434, "train/total_loss": 0.20025722682476044 }, { "entropy": 8.925800323486328, "epoch": 0.2407553885702986, "mean_token_accuracy": 0.7456230521202087, "num_tokens": 13381937.0, "step": 2435, "train/ce_loss": 0.5971159338951111 }, { "epoch": 0.2407553885702986, "step": 2435, "train/sim_loss": 0.03125 }, { "epoch": 0.2407553885702986, "step": 2435, "train/total_loss": 0.09096159040927887 }, { "entropy": 9.144954681396484, "epoch": 0.24085426141981411, "mean_token_accuracy": 0.7546728849411011, "num_tokens": 13387453.0, "step": 2436, "train/ce_loss": 0.28382939100265503 }, { "epoch": 0.24085426141981411, "step": 2436, "train/sim_loss": 0.0234375 }, { "epoch": 0.24085426141981411, "step": 2436, "train/total_loss": 0.05182044208049774 }, { "entropy": 8.87916374206543, "epoch": 0.24095313426932966, "mean_token_accuracy": 0.8061716556549072, "num_tokens": 13393151.0, "step": 2437, "train/ce_loss": 0.5573335289955139 }, { "epoch": 0.24095313426932966, "step": 2437, "train/sim_loss": 0.0625 }, { "epoch": 0.24095313426932966, "step": 2437, "train/total_loss": 0.11823335289955139 }, { "entropy": 9.187578201293945, "epoch": 0.24105200711884517, "mean_token_accuracy": 0.7488095164299011, "num_tokens": 13398613.0, "step": 2438, "train/ce_loss": 0.8693035244941711 }, { "epoch": 0.24105200711884517, "step": 2438, "train/sim_loss": 0.05859375 }, { "epoch": 0.24105200711884517, "step": 2438, "train/total_loss": 0.14552411437034607 }, { "entropy": 8.758769989013672, "epoch": 0.24115087996836068, "mean_token_accuracy": 0.7787524461746216, "num_tokens": 13404306.0, "step": 2439, "train/ce_loss": 0.30098089575767517 }, { "epoch": 0.24115087996836068, "step": 2439, "train/sim_loss": 0.0234375 }, { "epoch": 0.24115087996836068, "step": 2439, "train/total_loss": 0.0535355880856514 }, { "epoch": 0.24124975281787622, "grad_norm": 0.8929702043533325, "learning_rate": 9.39944617514711e-06, "loss": 0.1513, "step": 2440 }, { "entropy": 8.895760536193848, "epoch": 0.24124975281787622, "mean_token_accuracy": 0.7474226951599121, "num_tokens": 13409891.0, "step": 2440, "train/ce_loss": 0.6410526633262634 }, { "epoch": 0.24124975281787622, "step": 2440, "train/sim_loss": 0.125 }, { "epoch": 0.24124975281787622, "step": 2440, "train/total_loss": 0.18910527229309082 }, { "entropy": 9.188408851623535, "epoch": 0.24134862566739174, "mean_token_accuracy": 0.697752833366394, "num_tokens": 13415404.0, "step": 2441, "train/ce_loss": 0.7837318778038025 }, { "epoch": 0.24134862566739174, "step": 2441, "train/sim_loss": 0.078125 }, { "epoch": 0.24134862566739174, "step": 2441, "train/total_loss": 0.15649819374084473 }, { "entropy": 9.409379959106445, "epoch": 0.24144749851690725, "mean_token_accuracy": 0.7386215925216675, "num_tokens": 13420741.0, "step": 2442, "train/ce_loss": 1.0217965841293335 }, { "epoch": 0.24144749851690725, "step": 2442, "train/sim_loss": 0.08203125 }, { "epoch": 0.24144749851690725, "step": 2442, "train/total_loss": 0.1842109113931656 }, { "entropy": 8.85597038269043, "epoch": 0.2415463713664228, "mean_token_accuracy": 0.774129331111908, "num_tokens": 13426343.0, "step": 2443, "train/ce_loss": 1.1573092937469482 }, { "epoch": 0.2415463713664228, "step": 2443, "train/sim_loss": 0.05078125 }, { "epoch": 0.2415463713664228, "step": 2443, "train/total_loss": 0.16651219129562378 }, { "entropy": 9.036104202270508, "epoch": 0.2416452442159383, "mean_token_accuracy": 0.7533414363861084, "num_tokens": 13431755.0, "step": 2444, "train/ce_loss": 0.5979483127593994 }, { "epoch": 0.2416452442159383, "step": 2444, "train/sim_loss": 0.07421875 }, { "epoch": 0.2416452442159383, "step": 2444, "train/total_loss": 0.1340135782957077 }, { "entropy": 9.149088859558105, "epoch": 0.24174411706545382, "mean_token_accuracy": 0.759358286857605, "num_tokens": 13437056.0, "step": 2445, "train/ce_loss": 0.9974294900894165 }, { "epoch": 0.24174411706545382, "step": 2445, "train/sim_loss": 0.07421875 }, { "epoch": 0.24174411706545382, "step": 2445, "train/total_loss": 0.17396169900894165 }, { "entropy": 9.067731857299805, "epoch": 0.24184298991496936, "mean_token_accuracy": 0.7583892345428467, "num_tokens": 13442763.0, "step": 2446, "train/ce_loss": 0.8027241230010986 }, { "epoch": 0.24184298991496936, "step": 2446, "train/sim_loss": 0.109375 }, { "epoch": 0.24184298991496936, "step": 2446, "train/total_loss": 0.18964740633964539 }, { "entropy": 8.957456588745117, "epoch": 0.24194186276448487, "mean_token_accuracy": 0.7464480996131897, "num_tokens": 13448370.0, "step": 2447, "train/ce_loss": 0.5465542078018188 }, { "epoch": 0.24194186276448487, "step": 2447, "train/sim_loss": 0.1484375 }, { "epoch": 0.24194186276448487, "step": 2447, "train/total_loss": 0.20309291779994965 }, { "entropy": 9.134754180908203, "epoch": 0.24204073561400039, "mean_token_accuracy": 0.6640726327896118, "num_tokens": 13453789.0, "step": 2448, "train/ce_loss": 1.0815500020980835 }, { "epoch": 0.24204073561400039, "step": 2448, "train/sim_loss": 0.1484375 }, { "epoch": 0.24204073561400039, "step": 2448, "train/total_loss": 0.2565925121307373 }, { "entropy": 9.26327896118164, "epoch": 0.24213960846351593, "mean_token_accuracy": 0.748110830783844, "num_tokens": 13459187.0, "step": 2449, "train/ce_loss": 0.4673882722854614 }, { "epoch": 0.24213960846351593, "step": 2449, "train/sim_loss": 0.02734375 }, { "epoch": 0.24213960846351593, "step": 2449, "train/total_loss": 0.07408258318901062 }, { "entropy": 8.796772956848145, "epoch": 0.24223848131303144, "mean_token_accuracy": 0.7439180612564087, "num_tokens": 13464607.0, "step": 2450, "train/ce_loss": 0.5344161987304688 }, { "epoch": 0.24223848131303144, "step": 2450, "train/sim_loss": 0.0703125 }, { "epoch": 0.24223848131303144, "step": 2450, "train/total_loss": 0.123754121363163 }, { "entropy": 8.712947845458984, "epoch": 0.24233735416254695, "mean_token_accuracy": 0.7016769647598267, "num_tokens": 13470333.0, "step": 2451, "train/ce_loss": 0.7002038955688477 }, { "epoch": 0.24233735416254695, "step": 2451, "train/sim_loss": 0.05078125 }, { "epoch": 0.24233735416254695, "step": 2451, "train/total_loss": 0.120801642537117 }, { "entropy": 9.160923957824707, "epoch": 0.2424362270120625, "mean_token_accuracy": 0.7586604952812195, "num_tokens": 13475718.0, "step": 2452, "train/ce_loss": 0.5902289748191833 }, { "epoch": 0.2424362270120625, "step": 2452, "train/sim_loss": 0.02734375 }, { "epoch": 0.2424362270120625, "step": 2452, "train/total_loss": 0.08636665344238281 }, { "entropy": 9.128510475158691, "epoch": 0.242535099861578, "mean_token_accuracy": 0.8058252334594727, "num_tokens": 13481433.0, "step": 2453, "train/ce_loss": 0.8937333226203918 }, { "epoch": 0.242535099861578, "step": 2453, "train/sim_loss": 0.02734375 }, { "epoch": 0.242535099861578, "step": 2453, "train/total_loss": 0.11671708524227142 }, { "entropy": 9.074634552001953, "epoch": 0.24263397271109352, "mean_token_accuracy": 0.7311828136444092, "num_tokens": 13486780.0, "step": 2454, "train/ce_loss": 1.0734920501708984 }, { "epoch": 0.24263397271109352, "step": 2454, "train/sim_loss": 0.09765625 }, { "epoch": 0.24263397271109352, "step": 2454, "train/total_loss": 0.2050054669380188 }, { "entropy": 8.605195999145508, "epoch": 0.24273284556060906, "mean_token_accuracy": 0.6930612325668335, "num_tokens": 13492636.0, "step": 2455, "train/ce_loss": 0.726776123046875 }, { "epoch": 0.24273284556060906, "step": 2455, "train/sim_loss": 0.109375 }, { "epoch": 0.24273284556060906, "step": 2455, "train/total_loss": 0.1820526123046875 }, { "entropy": 9.372611999511719, "epoch": 0.24283171841012458, "mean_token_accuracy": 0.7102689743041992, "num_tokens": 13498036.0, "step": 2456, "train/ce_loss": 0.6559374928474426 }, { "epoch": 0.24283171841012458, "step": 2456, "train/sim_loss": 0.02734375 }, { "epoch": 0.24283171841012458, "step": 2456, "train/total_loss": 0.09293749928474426 }, { "entropy": 8.373779296875, "epoch": 0.24293059125964012, "mean_token_accuracy": 0.7106963396072388, "num_tokens": 13504052.0, "step": 2457, "train/ce_loss": 0.5823719501495361 }, { "epoch": 0.24293059125964012, "step": 2457, "train/sim_loss": 0.1015625 }, { "epoch": 0.24293059125964012, "step": 2457, "train/total_loss": 0.1597996950149536 }, { "entropy": 8.686786651611328, "epoch": 0.24302946410915563, "mean_token_accuracy": 0.7278845906257629, "num_tokens": 13509761.0, "step": 2458, "train/ce_loss": 0.3970146179199219 }, { "epoch": 0.24302946410915563, "step": 2458, "train/sim_loss": 0.0234375 }, { "epoch": 0.24302946410915563, "step": 2458, "train/total_loss": 0.06313896179199219 }, { "entropy": 8.785530090332031, "epoch": 0.24312833695867114, "mean_token_accuracy": 0.735551655292511, "num_tokens": 13515562.0, "step": 2459, "train/ce_loss": 0.5538773536682129 }, { "epoch": 0.24312833695867114, "step": 2459, "train/sim_loss": 0.06640625 }, { "epoch": 0.24312833695867114, "step": 2459, "train/total_loss": 0.12179398536682129 }, { "epoch": 0.24322720980818668, "grad_norm": 0.9254945516586304, "learning_rate": 9.394501310389161e-06, "loss": 0.1576, "step": 2460 }, { "entropy": 9.174077033996582, "epoch": 0.24322720980818668, "mean_token_accuracy": 0.7546699643135071, "num_tokens": 13520929.0, "step": 2460, "train/ce_loss": 0.7704933881759644 }, { "epoch": 0.24322720980818668, "step": 2460, "train/sim_loss": 0.0390625 }, { "epoch": 0.24322720980818668, "step": 2460, "train/total_loss": 0.11611183732748032 }, { "entropy": 9.526357650756836, "epoch": 0.2433260826577022, "mean_token_accuracy": 0.7212121486663818, "num_tokens": 13526190.0, "step": 2461, "train/ce_loss": 0.5180487632751465 }, { "epoch": 0.2433260826577022, "step": 2461, "train/sim_loss": 0.06640625 }, { "epoch": 0.2433260826577022, "step": 2461, "train/total_loss": 0.11821112781763077 }, { "entropy": 9.055994033813477, "epoch": 0.2434249555072177, "mean_token_accuracy": 0.7296996712684631, "num_tokens": 13531731.0, "step": 2462, "train/ce_loss": 0.8302079439163208 }, { "epoch": 0.2434249555072177, "step": 2462, "train/sim_loss": 0.1171875 }, { "epoch": 0.2434249555072177, "step": 2462, "train/total_loss": 0.20020830631256104 }, { "entropy": 9.006471633911133, "epoch": 0.24352382835673325, "mean_token_accuracy": 0.7874864935874939, "num_tokens": 13537311.0, "step": 2463, "train/ce_loss": 0.5222581624984741 }, { "epoch": 0.24352382835673325, "step": 2463, "train/sim_loss": 0.09375 }, { "epoch": 0.24352382835673325, "step": 2463, "train/total_loss": 0.14597581326961517 }, { "entropy": 8.874974250793457, "epoch": 0.24362270120624877, "mean_token_accuracy": 0.7198731303215027, "num_tokens": 13542870.0, "step": 2464, "train/ce_loss": 2.1973536014556885 }, { "epoch": 0.24362270120624877, "step": 2464, "train/sim_loss": 0.125 }, { "epoch": 0.24362270120624877, "step": 2464, "train/total_loss": 0.34473538398742676 }, { "entropy": 9.12371826171875, "epoch": 0.24372157405576428, "mean_token_accuracy": 0.7893518805503845, "num_tokens": 13548391.0, "step": 2465, "train/ce_loss": 0.7645032405853271 }, { "epoch": 0.24372157405576428, "step": 2465, "train/sim_loss": 0.09375 }, { "epoch": 0.24372157405576428, "step": 2465, "train/total_loss": 0.17020031809806824 }, { "entropy": 9.154958724975586, "epoch": 0.24382044690527982, "mean_token_accuracy": 0.7654321193695068, "num_tokens": 13553987.0, "step": 2466, "train/ce_loss": 0.6574394106864929 }, { "epoch": 0.24382044690527982, "step": 2466, "train/sim_loss": 0.0546875 }, { "epoch": 0.24382044690527982, "step": 2466, "train/total_loss": 0.12043144553899765 }, { "entropy": 9.228120803833008, "epoch": 0.24391931975479533, "mean_token_accuracy": 0.8121212124824524, "num_tokens": 13559359.0, "step": 2467, "train/ce_loss": 0.7715634107589722 }, { "epoch": 0.24391931975479533, "step": 2467, "train/sim_loss": 0.05078125 }, { "epoch": 0.24391931975479533, "step": 2467, "train/total_loss": 0.12793758511543274 }, { "entropy": 9.243097305297852, "epoch": 0.24401819260431085, "mean_token_accuracy": 0.778181791305542, "num_tokens": 13564853.0, "step": 2468, "train/ce_loss": 0.7529692053794861 }, { "epoch": 0.24401819260431085, "step": 2468, "train/sim_loss": 0.10546875 }, { "epoch": 0.24401819260431085, "step": 2468, "train/total_loss": 0.18076567351818085 }, { "entropy": 9.00951099395752, "epoch": 0.2441170654538264, "mean_token_accuracy": 0.7420118451118469, "num_tokens": 13570335.0, "step": 2469, "train/ce_loss": 0.9796106219291687 }, { "epoch": 0.2441170654538264, "step": 2469, "train/sim_loss": 0.04296875 }, { "epoch": 0.2441170654538264, "step": 2469, "train/total_loss": 0.14092981815338135 }, { "entropy": 8.968740463256836, "epoch": 0.2442159383033419, "mean_token_accuracy": 0.7602552175521851, "num_tokens": 13575950.0, "step": 2470, "train/ce_loss": 0.2519640028476715 }, { "epoch": 0.2442159383033419, "step": 2470, "train/sim_loss": 0.06640625 }, { "epoch": 0.2442159383033419, "step": 2470, "train/total_loss": 0.09160265326499939 }, { "entropy": 9.279936790466309, "epoch": 0.24431481115285741, "mean_token_accuracy": 0.7032085657119751, "num_tokens": 13581339.0, "step": 2471, "train/ce_loss": 0.5962768197059631 }, { "epoch": 0.24431481115285741, "step": 2471, "train/sim_loss": 0.046875 }, { "epoch": 0.24431481115285741, "step": 2471, "train/total_loss": 0.10650268197059631 }, { "entropy": 8.981837272644043, "epoch": 0.24441368400237296, "mean_token_accuracy": 0.7665975093841553, "num_tokens": 13586950.0, "step": 2472, "train/ce_loss": 0.7139501571655273 }, { "epoch": 0.24441368400237296, "step": 2472, "train/sim_loss": 0.10546875 }, { "epoch": 0.24441368400237296, "step": 2472, "train/total_loss": 0.17686375975608826 }, { "entropy": 9.251829147338867, "epoch": 0.24451255685188847, "mean_token_accuracy": 0.7187879085540771, "num_tokens": 13592408.0, "step": 2473, "train/ce_loss": 0.5313054919242859 }, { "epoch": 0.24451255685188847, "step": 2473, "train/sim_loss": 0.03125 }, { "epoch": 0.24451255685188847, "step": 2473, "train/total_loss": 0.08438055217266083 }, { "entropy": 9.07516860961914, "epoch": 0.24461142970140398, "mean_token_accuracy": 0.7513812184333801, "num_tokens": 13597933.0, "step": 2474, "train/ce_loss": 0.3839261829853058 }, { "epoch": 0.24461142970140398, "step": 2474, "train/sim_loss": 0.11328125 }, { "epoch": 0.24461142970140398, "step": 2474, "train/total_loss": 0.15167386829853058 }, { "entropy": 9.40049934387207, "epoch": 0.24471030255091952, "mean_token_accuracy": 0.7536618113517761, "num_tokens": 13603259.0, "step": 2475, "train/ce_loss": 1.1855719089508057 }, { "epoch": 0.24471030255091952, "step": 2475, "train/sim_loss": 0.05859375 }, { "epoch": 0.24471030255091952, "step": 2475, "train/total_loss": 0.1771509349346161 }, { "entropy": 8.842683792114258, "epoch": 0.24480917540043504, "mean_token_accuracy": 0.778593897819519, "num_tokens": 13608846.0, "step": 2476, "train/ce_loss": 0.46072712540626526 }, { "epoch": 0.24480917540043504, "step": 2476, "train/sim_loss": 0.0546875 }, { "epoch": 0.24480917540043504, "step": 2476, "train/total_loss": 0.10076021403074265 }, { "entropy": 9.210159301757812, "epoch": 0.24490804824995058, "mean_token_accuracy": 0.7304785847663879, "num_tokens": 13614200.0, "step": 2477, "train/ce_loss": 0.6691291928291321 }, { "epoch": 0.24490804824995058, "step": 2477, "train/sim_loss": 0.0546875 }, { "epoch": 0.24490804824995058, "step": 2477, "train/total_loss": 0.12160041928291321 }, { "entropy": 9.0802001953125, "epoch": 0.2450069210994661, "mean_token_accuracy": 0.7249114513397217, "num_tokens": 13619628.0, "step": 2478, "train/ce_loss": 0.6334500908851624 }, { "epoch": 0.2450069210994661, "step": 2478, "train/sim_loss": 0.03125 }, { "epoch": 0.2450069210994661, "step": 2478, "train/total_loss": 0.09459500759840012 }, { "entropy": 9.068818092346191, "epoch": 0.2451057939489816, "mean_token_accuracy": 0.7075688242912292, "num_tokens": 13625098.0, "step": 2479, "train/ce_loss": 0.7046844959259033 }, { "epoch": 0.2451057939489816, "step": 2479, "train/sim_loss": 0.0703125 }, { "epoch": 0.2451057939489816, "step": 2479, "train/total_loss": 0.1407809555530548 }, { "epoch": 0.24520466679849715, "grad_norm": 0.8835862278938293, "learning_rate": 9.389556445631213e-06, "loss": 0.1437, "step": 2480 }, { "entropy": 9.18278694152832, "epoch": 0.24520466679849715, "mean_token_accuracy": 0.772506058216095, "num_tokens": 13630585.0, "step": 2480, "train/ce_loss": 0.7813251614570618 }, { "epoch": 0.24520466679849715, "step": 2480, "train/sim_loss": 0.0234375 }, { "epoch": 0.24520466679849715, "step": 2480, "train/total_loss": 0.1015700176358223 }, { "entropy": 9.37770938873291, "epoch": 0.24530353964801266, "mean_token_accuracy": 0.7617765665054321, "num_tokens": 13635892.0, "step": 2481, "train/ce_loss": 0.5197727680206299 }, { "epoch": 0.24530353964801266, "step": 2481, "train/sim_loss": 0.0703125 }, { "epoch": 0.24530353964801266, "step": 2481, "train/total_loss": 0.12228977680206299 }, { "entropy": 8.894172668457031, "epoch": 0.24540241249752817, "mean_token_accuracy": 0.7487828731536865, "num_tokens": 13641523.0, "step": 2482, "train/ce_loss": 0.7538101077079773 }, { "epoch": 0.24540241249752817, "step": 2482, "train/sim_loss": 0.16796875 }, { "epoch": 0.24540241249752817, "step": 2482, "train/total_loss": 0.24334976077079773 }, { "entropy": 9.490640640258789, "epoch": 0.2455012853470437, "mean_token_accuracy": 0.725806474685669, "num_tokens": 13646981.0, "step": 2483, "train/ce_loss": 0.9295834898948669 }, { "epoch": 0.2455012853470437, "step": 2483, "train/sim_loss": 0.12890625 }, { "epoch": 0.2455012853470437, "step": 2483, "train/total_loss": 0.22186461091041565 }, { "entropy": 9.176568984985352, "epoch": 0.24560015819655923, "mean_token_accuracy": 0.7146226167678833, "num_tokens": 13652419.0, "step": 2484, "train/ce_loss": 0.9519146084785461 }, { "epoch": 0.24560015819655923, "step": 2484, "train/sim_loss": 0.0703125 }, { "epoch": 0.24560015819655923, "step": 2484, "train/total_loss": 0.16550396382808685 }, { "entropy": 8.34402847290039, "epoch": 0.24569903104607474, "mean_token_accuracy": 0.7546138167381287, "num_tokens": 13658511.0, "step": 2485, "train/ce_loss": 0.7093561887741089 }, { "epoch": 0.24569903104607474, "step": 2485, "train/sim_loss": 0.08203125 }, { "epoch": 0.24569903104607474, "step": 2485, "train/total_loss": 0.15296687185764313 }, { "entropy": 9.326410293579102, "epoch": 0.24579790389559028, "mean_token_accuracy": 0.7877907156944275, "num_tokens": 13663832.0, "step": 2486, "train/ce_loss": 0.48955172300338745 }, { "epoch": 0.24579790389559028, "step": 2486, "train/sim_loss": 0.05078125 }, { "epoch": 0.24579790389559028, "step": 2486, "train/total_loss": 0.09973642230033875 }, { "entropy": 8.968243598937988, "epoch": 0.2458967767451058, "mean_token_accuracy": 0.7440811991691589, "num_tokens": 13669297.0, "step": 2487, "train/ce_loss": 0.9138596057891846 }, { "epoch": 0.2458967767451058, "step": 2487, "train/sim_loss": 0.125 }, { "epoch": 0.2458967767451058, "step": 2487, "train/total_loss": 0.21638596057891846 }, { "entropy": 8.371915817260742, "epoch": 0.2459956495946213, "mean_token_accuracy": 0.7297710180282593, "num_tokens": 13675185.0, "step": 2488, "train/ce_loss": 0.5449524521827698 }, { "epoch": 0.2459956495946213, "step": 2488, "train/sim_loss": 0.03515625 }, { "epoch": 0.2459956495946213, "step": 2488, "train/total_loss": 0.08965149521827698 }, { "entropy": 9.055373191833496, "epoch": 0.24609452244413685, "mean_token_accuracy": 0.7368420958518982, "num_tokens": 13680733.0, "step": 2489, "train/ce_loss": 1.1023415327072144 }, { "epoch": 0.24609452244413685, "step": 2489, "train/sim_loss": 0.07421875 }, { "epoch": 0.24609452244413685, "step": 2489, "train/total_loss": 0.18445290625095367 }, { "entropy": 9.370217323303223, "epoch": 0.24619339529365236, "mean_token_accuracy": 0.7522658705711365, "num_tokens": 13685960.0, "step": 2490, "train/ce_loss": 0.7881582379341125 }, { "epoch": 0.24619339529365236, "step": 2490, "train/sim_loss": 0.078125 }, { "epoch": 0.24619339529365236, "step": 2490, "train/total_loss": 0.15694081783294678 }, { "entropy": 9.578939437866211, "epoch": 0.24629226814316788, "mean_token_accuracy": 0.7621145248413086, "num_tokens": 13691252.0, "step": 2491, "train/ce_loss": 1.1800836324691772 }, { "epoch": 0.24629226814316788, "step": 2491, "train/sim_loss": 0.07421875 }, { "epoch": 0.24629226814316788, "step": 2491, "train/total_loss": 0.19222712516784668 }, { "entropy": 9.027331352233887, "epoch": 0.24639114099268342, "mean_token_accuracy": 0.7894179821014404, "num_tokens": 13696851.0, "step": 2492, "train/ce_loss": 0.7603301405906677 }, { "epoch": 0.24639114099268342, "step": 2492, "train/sim_loss": 0.08203125 }, { "epoch": 0.24639114099268342, "step": 2492, "train/total_loss": 0.15806427597999573 }, { "entropy": 9.142775535583496, "epoch": 0.24649001384219893, "mean_token_accuracy": 0.7147650718688965, "num_tokens": 13702315.0, "step": 2493, "train/ce_loss": 0.5765286684036255 }, { "epoch": 0.24649001384219893, "step": 2493, "train/sim_loss": 0.05859375 }, { "epoch": 0.24649001384219893, "step": 2493, "train/total_loss": 0.11624661833047867 }, { "entropy": 9.419833183288574, "epoch": 0.24658888669171444, "mean_token_accuracy": 0.729613721370697, "num_tokens": 13707615.0, "step": 2494, "train/ce_loss": 1.2659474611282349 }, { "epoch": 0.24658888669171444, "step": 2494, "train/sim_loss": 0.109375 }, { "epoch": 0.24658888669171444, "step": 2494, "train/total_loss": 0.23596975207328796 }, { "entropy": 9.37590503692627, "epoch": 0.24668775954122998, "mean_token_accuracy": 0.7621023654937744, "num_tokens": 13712902.0, "step": 2495, "train/ce_loss": 0.5639661550521851 }, { "epoch": 0.24668775954122998, "step": 2495, "train/sim_loss": 0.02734375 }, { "epoch": 0.24668775954122998, "step": 2495, "train/total_loss": 0.08374036848545074 }, { "entropy": 8.859838485717773, "epoch": 0.2467866323907455, "mean_token_accuracy": 0.7232510447502136, "num_tokens": 13718517.0, "step": 2496, "train/ce_loss": 0.5507077574729919 }, { "epoch": 0.2467866323907455, "step": 2496, "train/sim_loss": 0.04296875 }, { "epoch": 0.2467866323907455, "step": 2496, "train/total_loss": 0.09803952276706696 }, { "entropy": 9.236291885375977, "epoch": 0.246885505240261, "mean_token_accuracy": 0.795918345451355, "num_tokens": 13723904.0, "step": 2497, "train/ce_loss": 0.6185545921325684 }, { "epoch": 0.246885505240261, "step": 2497, "train/sim_loss": 0.05859375 }, { "epoch": 0.246885505240261, "step": 2497, "train/total_loss": 0.12044921517372131 }, { "entropy": 9.003756523132324, "epoch": 0.24698437808977655, "mean_token_accuracy": 0.7020023465156555, "num_tokens": 13729424.0, "step": 2498, "train/ce_loss": 0.5649727582931519 }, { "epoch": 0.24698437808977655, "step": 2498, "train/sim_loss": 0.09375 }, { "epoch": 0.24698437808977655, "step": 2498, "train/total_loss": 0.15024727582931519 }, { "entropy": 8.852783203125, "epoch": 0.24708325093929207, "mean_token_accuracy": 0.7598684430122375, "num_tokens": 13734975.0, "step": 2499, "train/ce_loss": 0.7350491285324097 }, { "epoch": 0.24708325093929207, "step": 2499, "train/sim_loss": 0.0546875 }, { "epoch": 0.24708325093929207, "step": 2499, "train/total_loss": 0.12819242477416992 }, { "epoch": 0.2471821237888076, "grad_norm": 0.7564249634742737, "learning_rate": 9.384611580873264e-06, "loss": 0.151, "step": 2500 }, { "entropy": 8.758055686950684, "epoch": 0.2471821237888076, "mean_token_accuracy": 0.7636544108390808, "num_tokens": 13740605.0, "step": 2500, "train/ce_loss": 0.9523006081581116 }, { "epoch": 0.2471821237888076, "step": 2500, "train/sim_loss": 0.0625 }, { "epoch": 0.2471821237888076, "step": 2500, "train/total_loss": 0.1577300727367401 }, { "entropy": 9.08548355102539, "epoch": 0.24728099663832312, "mean_token_accuracy": 0.7125827670097351, "num_tokens": 13746031.0, "step": 2501, "train/ce_loss": 0.8317070007324219 }, { "epoch": 0.24728099663832312, "step": 2501, "train/sim_loss": 0.04296875 }, { "epoch": 0.24728099663832312, "step": 2501, "train/total_loss": 0.12613946199417114 }, { "entropy": 9.090299606323242, "epoch": 0.24737986948783863, "mean_token_accuracy": 0.7317675948143005, "num_tokens": 13751477.0, "step": 2502, "train/ce_loss": 0.6353463530540466 }, { "epoch": 0.24737986948783863, "step": 2502, "train/sim_loss": 0.08203125 }, { "epoch": 0.24737986948783863, "step": 2502, "train/total_loss": 0.14556589722633362 }, { "entropy": 9.029054641723633, "epoch": 0.24747874233735417, "mean_token_accuracy": 0.7066666483879089, "num_tokens": 13757022.0, "step": 2503, "train/ce_loss": 0.6523033976554871 }, { "epoch": 0.24747874233735417, "step": 2503, "train/sim_loss": 0.046875 }, { "epoch": 0.24747874233735417, "step": 2503, "train/total_loss": 0.1121053397655487 }, { "entropy": 9.026205062866211, "epoch": 0.2475776151868697, "mean_token_accuracy": 0.7234762907028198, "num_tokens": 13762490.0, "step": 2504, "train/ce_loss": 0.5986970067024231 }, { "epoch": 0.2475776151868697, "step": 2504, "train/sim_loss": 0.0390625 }, { "epoch": 0.2475776151868697, "step": 2504, "train/total_loss": 0.09893220663070679 }, { "entropy": 9.255341529846191, "epoch": 0.2476764880363852, "mean_token_accuracy": 0.7549260854721069, "num_tokens": 13767903.0, "step": 2505, "train/ce_loss": 0.9454443454742432 }, { "epoch": 0.2476764880363852, "step": 2505, "train/sim_loss": 0.08984375 }, { "epoch": 0.2476764880363852, "step": 2505, "train/total_loss": 0.1843881905078888 }, { "entropy": 9.120267868041992, "epoch": 0.24777536088590074, "mean_token_accuracy": 0.7137930989265442, "num_tokens": 13773289.0, "step": 2506, "train/ce_loss": 1.1808849573135376 }, { "epoch": 0.24777536088590074, "step": 2506, "train/sim_loss": 0.046875 }, { "epoch": 0.24777536088590074, "step": 2506, "train/total_loss": 0.164963498711586 }, { "entropy": 9.123156547546387, "epoch": 0.24787423373541626, "mean_token_accuracy": 0.7811059951782227, "num_tokens": 13778738.0, "step": 2507, "train/ce_loss": 0.45552393794059753 }, { "epoch": 0.24787423373541626, "step": 2507, "train/sim_loss": 0.046875 }, { "epoch": 0.24787423373541626, "step": 2507, "train/total_loss": 0.09242739528417587 }, { "entropy": 9.400798797607422, "epoch": 0.24797310658493177, "mean_token_accuracy": 0.788170576095581, "num_tokens": 13784001.0, "step": 2508, "train/ce_loss": 0.40246686339378357 }, { "epoch": 0.24797310658493177, "step": 2508, "train/sim_loss": 0.0625 }, { "epoch": 0.24797310658493177, "step": 2508, "train/total_loss": 0.10274668782949448 }, { "entropy": 9.123509407043457, "epoch": 0.2480719794344473, "mean_token_accuracy": 0.7144362330436707, "num_tokens": 13789464.0, "step": 2509, "train/ce_loss": 1.0028389692306519 }, { "epoch": 0.2480719794344473, "step": 2509, "train/sim_loss": 0.07421875 }, { "epoch": 0.2480719794344473, "step": 2509, "train/total_loss": 0.1745026409626007 }, { "entropy": 9.118659973144531, "epoch": 0.24817085228396282, "mean_token_accuracy": 0.7276166677474976, "num_tokens": 13794901.0, "step": 2510, "train/ce_loss": 0.981614351272583 }, { "epoch": 0.24817085228396282, "step": 2510, "train/sim_loss": 0.05078125 }, { "epoch": 0.24817085228396282, "step": 2510, "train/total_loss": 0.14894267916679382 }, { "entropy": 9.14001178741455, "epoch": 0.24826972513347834, "mean_token_accuracy": 0.7468926310539246, "num_tokens": 13800338.0, "step": 2511, "train/ce_loss": 0.4110572934150696 }, { "epoch": 0.24826972513347834, "step": 2511, "train/sim_loss": 0.06640625 }, { "epoch": 0.24826972513347834, "step": 2511, "train/total_loss": 0.1075119823217392 }, { "entropy": 9.193622589111328, "epoch": 0.24836859798299388, "mean_token_accuracy": 0.7862069010734558, "num_tokens": 13805861.0, "step": 2512, "train/ce_loss": 0.5864476561546326 }, { "epoch": 0.24836859798299388, "step": 2512, "train/sim_loss": 0.05078125 }, { "epoch": 0.24836859798299388, "step": 2512, "train/total_loss": 0.10942602157592773 }, { "entropy": 9.167329788208008, "epoch": 0.2484674708325094, "mean_token_accuracy": 0.7405475974082947, "num_tokens": 13811221.0, "step": 2513, "train/ce_loss": 0.6400745511054993 }, { "epoch": 0.2484674708325094, "step": 2513, "train/sim_loss": 0.08203125 }, { "epoch": 0.2484674708325094, "step": 2513, "train/total_loss": 0.1460387110710144 }, { "entropy": 9.113824844360352, "epoch": 0.2485663436820249, "mean_token_accuracy": 0.7808535099029541, "num_tokens": 13816719.0, "step": 2514, "train/ce_loss": 0.6661072969436646 }, { "epoch": 0.2485663436820249, "step": 2514, "train/sim_loss": 0.06640625 }, { "epoch": 0.2485663436820249, "step": 2514, "train/total_loss": 0.13301697373390198 }, { "entropy": 8.997267723083496, "epoch": 0.24866521653154045, "mean_token_accuracy": 0.7802631855010986, "num_tokens": 13822100.0, "step": 2515, "train/ce_loss": 0.6380594968795776 }, { "epoch": 0.24866521653154045, "step": 2515, "train/sim_loss": 0.09375 }, { "epoch": 0.24866521653154045, "step": 2515, "train/total_loss": 0.15755595266819 }, { "entropy": 9.064447402954102, "epoch": 0.24876408938105596, "mean_token_accuracy": 0.805902361869812, "num_tokens": 13827556.0, "step": 2516, "train/ce_loss": 0.5448733568191528 }, { "epoch": 0.24876408938105596, "step": 2516, "train/sim_loss": 0.0234375 }, { "epoch": 0.24876408938105596, "step": 2516, "train/total_loss": 0.07792483270168304 }, { "entropy": 8.878552436828613, "epoch": 0.24886296223057147, "mean_token_accuracy": 0.7020202279090881, "num_tokens": 13833183.0, "step": 2517, "train/ce_loss": 1.1914528608322144 }, { "epoch": 0.24886296223057147, "step": 2517, "train/sim_loss": 0.0703125 }, { "epoch": 0.24886296223057147, "step": 2517, "train/total_loss": 0.18945778906345367 }, { "entropy": 9.021735191345215, "epoch": 0.248961835080087, "mean_token_accuracy": 0.7412823438644409, "num_tokens": 13838727.0, "step": 2518, "train/ce_loss": 0.5606525540351868 }, { "epoch": 0.248961835080087, "step": 2518, "train/sim_loss": 0.02734375 }, { "epoch": 0.248961835080087, "step": 2518, "train/total_loss": 0.08340901136398315 }, { "entropy": 9.337507247924805, "epoch": 0.24906070792960253, "mean_token_accuracy": 0.7256990671157837, "num_tokens": 13844045.0, "step": 2519, "train/ce_loss": 0.956026554107666 }, { "epoch": 0.24906070792960253, "step": 2519, "train/sim_loss": 0.0703125 }, { "epoch": 0.24906070792960253, "step": 2519, "train/total_loss": 0.16591516137123108 }, { "epoch": 0.24915958077911807, "grad_norm": 0.9964116215705872, "learning_rate": 9.379666716115316e-06, "loss": 0.148, "step": 2520 }, { "entropy": 8.847679138183594, "epoch": 0.24915958077911807, "mean_token_accuracy": 0.7385984659194946, "num_tokens": 13849596.0, "step": 2520, "train/ce_loss": 0.8532182574272156 }, { "epoch": 0.24915958077911807, "step": 2520, "train/sim_loss": 0.13671875 }, { "epoch": 0.24915958077911807, "step": 2520, "train/total_loss": 0.2220405787229538 }, { "entropy": 9.304072380065918, "epoch": 0.24925845362863358, "mean_token_accuracy": 0.747586190700531, "num_tokens": 13854950.0, "step": 2521, "train/ce_loss": 0.5944007039070129 }, { "epoch": 0.24925845362863358, "step": 2521, "train/sim_loss": 0.0625 }, { "epoch": 0.24925845362863358, "step": 2521, "train/total_loss": 0.12194007635116577 }, { "entropy": 9.101921081542969, "epoch": 0.2493573264781491, "mean_token_accuracy": 0.6928746700286865, "num_tokens": 13860397.0, "step": 2522, "train/ce_loss": 1.0735619068145752 }, { "epoch": 0.2493573264781491, "step": 2522, "train/sim_loss": 0.0625 }, { "epoch": 0.2493573264781491, "step": 2522, "train/total_loss": 0.16985619068145752 }, { "entropy": 9.114954948425293, "epoch": 0.24945619932766464, "mean_token_accuracy": 0.7456140518188477, "num_tokens": 13865848.0, "step": 2523, "train/ce_loss": 0.7876490354537964 }, { "epoch": 0.24945619932766464, "step": 2523, "train/sim_loss": 0.05078125 }, { "epoch": 0.24945619932766464, "step": 2523, "train/total_loss": 0.1295461654663086 }, { "entropy": 9.06376838684082, "epoch": 0.24955507217718015, "mean_token_accuracy": 0.7401130199432373, "num_tokens": 13871318.0, "step": 2524, "train/ce_loss": 0.8126762509346008 }, { "epoch": 0.24955507217718015, "step": 2524, "train/sim_loss": 0.02734375 }, { "epoch": 0.24955507217718015, "step": 2524, "train/total_loss": 0.10861137509346008 }, { "entropy": 9.160922050476074, "epoch": 0.24965394502669566, "mean_token_accuracy": 0.7076923251152039, "num_tokens": 13876799.0, "step": 2525, "train/ce_loss": 0.5388709306716919 }, { "epoch": 0.24965394502669566, "step": 2525, "train/sim_loss": 0.08203125 }, { "epoch": 0.24965394502669566, "step": 2525, "train/total_loss": 0.13591834902763367 }, { "entropy": 8.734068870544434, "epoch": 0.2497528178762112, "mean_token_accuracy": 0.7286995649337769, "num_tokens": 13882409.0, "step": 2526, "train/ce_loss": 1.1146831512451172 }, { "epoch": 0.2497528178762112, "step": 2526, "train/sim_loss": 0.08203125 }, { "epoch": 0.2497528178762112, "step": 2526, "train/total_loss": 0.19349956512451172 }, { "entropy": 8.851436614990234, "epoch": 0.24985169072572672, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 13888035.0, "step": 2527, "train/ce_loss": 1.0086554288864136 }, { "epoch": 0.24985169072572672, "step": 2527, "train/sim_loss": 0.0546875 }, { "epoch": 0.24985169072572672, "step": 2527, "train/total_loss": 0.15555304288864136 }, { "entropy": 9.081197738647461, "epoch": 0.24995056357524223, "mean_token_accuracy": 0.7620614171028137, "num_tokens": 13893726.0, "step": 2528, "train/ce_loss": 0.8695942163467407 }, { "epoch": 0.24995056357524223, "step": 2528, "train/sim_loss": 0.11328125 }, { "epoch": 0.24995056357524223, "step": 2528, "train/total_loss": 0.20024067163467407 }, { "entropy": 8.67812442779541, "epoch": 0.25004943642475774, "mean_token_accuracy": 0.724609375, "num_tokens": 13899265.0, "step": 2529, "train/ce_loss": 1.801985263824463 }, { "epoch": 0.25004943642475774, "step": 2529, "train/sim_loss": 0.05859375 }, { "epoch": 0.25004943642475774, "step": 2529, "train/total_loss": 0.238792285323143 }, { "entropy": 8.606287002563477, "epoch": 0.2501483092742733, "mean_token_accuracy": 0.6785110235214233, "num_tokens": 13905075.0, "step": 2530, "train/ce_loss": 0.4884294867515564 }, { "epoch": 0.2501483092742733, "step": 2530, "train/sim_loss": 0.0859375 }, { "epoch": 0.2501483092742733, "step": 2530, "train/total_loss": 0.13478045165538788 }, { "entropy": 8.811017990112305, "epoch": 0.2502471821237888, "mean_token_accuracy": 0.7540805339813232, "num_tokens": 13910671.0, "step": 2531, "train/ce_loss": 1.056779146194458 }, { "epoch": 0.2502471821237888, "step": 2531, "train/sim_loss": 0.1171875 }, { "epoch": 0.2502471821237888, "step": 2531, "train/total_loss": 0.22286541759967804 }, { "entropy": 8.98241138458252, "epoch": 0.2503460549733043, "mean_token_accuracy": 0.7832929491996765, "num_tokens": 13916151.0, "step": 2532, "train/ce_loss": 0.6610193252563477 }, { "epoch": 0.2503460549733043, "step": 2532, "train/sim_loss": 0.09375 }, { "epoch": 0.2503460549733043, "step": 2532, "train/total_loss": 0.15985193848609924 }, { "entropy": 8.971296310424805, "epoch": 0.25044492782281985, "mean_token_accuracy": 0.6707946062088013, "num_tokens": 13921722.0, "step": 2533, "train/ce_loss": 0.8310142159461975 }, { "epoch": 0.25044492782281985, "step": 2533, "train/sim_loss": 0.0234375 }, { "epoch": 0.25044492782281985, "step": 2533, "train/total_loss": 0.10653892159461975 }, { "entropy": 8.904974937438965, "epoch": 0.2505438006723354, "mean_token_accuracy": 0.6997840404510498, "num_tokens": 13927300.0, "step": 2534, "train/ce_loss": 0.7866368889808655 }, { "epoch": 0.2505438006723354, "step": 2534, "train/sim_loss": 0.05078125 }, { "epoch": 0.2505438006723354, "step": 2534, "train/total_loss": 0.1294449418783188 }, { "entropy": 8.772804260253906, "epoch": 0.2506426735218509, "mean_token_accuracy": 0.7449735403060913, "num_tokens": 13932850.0, "step": 2535, "train/ce_loss": 0.8663722276687622 }, { "epoch": 0.2506426735218509, "step": 2535, "train/sim_loss": 0.0859375 }, { "epoch": 0.2506426735218509, "step": 2535, "train/total_loss": 0.1725747287273407 }, { "entropy": 8.628971099853516, "epoch": 0.2507415463713664, "mean_token_accuracy": 0.7419659495353699, "num_tokens": 13938526.0, "step": 2536, "train/ce_loss": 0.5323888659477234 }, { "epoch": 0.2507415463713664, "step": 2536, "train/sim_loss": 0.0546875 }, { "epoch": 0.2507415463713664, "step": 2536, "train/total_loss": 0.1079263836145401 }, { "entropy": 9.032991409301758, "epoch": 0.25084041922088196, "mean_token_accuracy": 0.8038277626037598, "num_tokens": 13943983.0, "step": 2537, "train/ce_loss": 0.49052444100379944 }, { "epoch": 0.25084041922088196, "step": 2537, "train/sim_loss": 0.09375 }, { "epoch": 0.25084041922088196, "step": 2537, "train/total_loss": 0.14280244708061218 }, { "entropy": 8.92033576965332, "epoch": 0.25093929207039745, "mean_token_accuracy": 0.7216035723686218, "num_tokens": 13949427.0, "step": 2538, "train/ce_loss": 0.7838791608810425 }, { "epoch": 0.25093929207039745, "step": 2538, "train/sim_loss": 0.0625 }, { "epoch": 0.25093929207039745, "step": 2538, "train/total_loss": 0.14088791608810425 }, { "entropy": 8.896997451782227, "epoch": 0.251038164919913, "mean_token_accuracy": 0.7434841990470886, "num_tokens": 13954802.0, "step": 2539, "train/ce_loss": 0.7234665155410767 }, { "epoch": 0.251038164919913, "step": 2539, "train/sim_loss": 0.0703125 }, { "epoch": 0.251038164919913, "step": 2539, "train/total_loss": 0.14265915751457214 }, { "epoch": 0.25113703776942853, "grad_norm": 0.9829937815666199, "learning_rate": 9.374721851357365e-06, "loss": 0.1556, "step": 2540 }, { "entropy": 9.55469799041748, "epoch": 0.25113703776942853, "mean_token_accuracy": 0.7661927342414856, "num_tokens": 13960027.0, "step": 2540, "train/ce_loss": 0.774243175983429 }, { "epoch": 0.25113703776942853, "step": 2540, "train/sim_loss": 0.05078125 }, { "epoch": 0.25113703776942853, "step": 2540, "train/total_loss": 0.1282055675983429 }, { "entropy": 8.970592498779297, "epoch": 0.251235910618944, "mean_token_accuracy": 0.7553793787956238, "num_tokens": 13965589.0, "step": 2541, "train/ce_loss": 0.634501039981842 }, { "epoch": 0.251235910618944, "step": 2541, "train/sim_loss": 0.0234375 }, { "epoch": 0.251235910618944, "step": 2541, "train/total_loss": 0.08688760548830032 }, { "entropy": 9.122916221618652, "epoch": 0.25133478346845955, "mean_token_accuracy": 0.7654321193695068, "num_tokens": 13971218.0, "step": 2542, "train/ce_loss": 0.5776782035827637 }, { "epoch": 0.25133478346845955, "step": 2542, "train/sim_loss": 0.05078125 }, { "epoch": 0.25133478346845955, "step": 2542, "train/total_loss": 0.1085490733385086 }, { "entropy": 9.117633819580078, "epoch": 0.2514336563179751, "mean_token_accuracy": 0.7087845802307129, "num_tokens": 13976719.0, "step": 2543, "train/ce_loss": 1.1118882894515991 }, { "epoch": 0.2514336563179751, "step": 2543, "train/sim_loss": 0.109375 }, { "epoch": 0.2514336563179751, "step": 2543, "train/total_loss": 0.2205638289451599 }, { "entropy": 9.410158157348633, "epoch": 0.2515325291674906, "mean_token_accuracy": 0.7398256063461304, "num_tokens": 13982018.0, "step": 2544, "train/ce_loss": 0.734348475933075 }, { "epoch": 0.2515325291674906, "step": 2544, "train/sim_loss": 0.05859375 }, { "epoch": 0.2515325291674906, "step": 2544, "train/total_loss": 0.13202860951423645 }, { "entropy": 9.497407913208008, "epoch": 0.2516314020170061, "mean_token_accuracy": 0.7537091970443726, "num_tokens": 13987327.0, "step": 2545, "train/ce_loss": 1.2087719440460205 }, { "epoch": 0.2516314020170061, "step": 2545, "train/sim_loss": 0.19140625 }, { "epoch": 0.2516314020170061, "step": 2545, "train/total_loss": 0.312283456325531 }, { "entropy": 9.19740104675293, "epoch": 0.25173027486652166, "mean_token_accuracy": 0.6791907548904419, "num_tokens": 13992610.0, "step": 2546, "train/ce_loss": 1.5149204730987549 }, { "epoch": 0.25173027486652166, "step": 2546, "train/sim_loss": 0.09765625 }, { "epoch": 0.25173027486652166, "step": 2546, "train/total_loss": 0.24914829432964325 }, { "entropy": 9.047210693359375, "epoch": 0.25182914771603715, "mean_token_accuracy": 0.7484884858131409, "num_tokens": 13998084.0, "step": 2547, "train/ce_loss": 1.2397617101669312 }, { "epoch": 0.25182914771603715, "step": 2547, "train/sim_loss": 0.05078125 }, { "epoch": 0.25182914771603715, "step": 2547, "train/total_loss": 0.17475742101669312 }, { "entropy": 8.59426498413086, "epoch": 0.2519280205655527, "mean_token_accuracy": 0.7020373344421387, "num_tokens": 14003793.0, "step": 2548, "train/ce_loss": 0.4652705788612366 }, { "epoch": 0.2519280205655527, "step": 2548, "train/sim_loss": 0.1015625 }, { "epoch": 0.2519280205655527, "step": 2548, "train/total_loss": 0.14808955788612366 }, { "entropy": 8.507165908813477, "epoch": 0.25202689341506823, "mean_token_accuracy": 0.6884120106697083, "num_tokens": 14009579.0, "step": 2549, "train/ce_loss": 0.955007791519165 }, { "epoch": 0.25202689341506823, "step": 2549, "train/sim_loss": 0.125 }, { "epoch": 0.25202689341506823, "step": 2549, "train/total_loss": 0.22050078213214874 }, { "entropy": 9.0337553024292, "epoch": 0.2521257662645838, "mean_token_accuracy": 0.7144549489021301, "num_tokens": 14015065.0, "step": 2550, "train/ce_loss": 0.8544270992279053 }, { "epoch": 0.2521257662645838, "step": 2550, "train/sim_loss": 0.078125 }, { "epoch": 0.2521257662645838, "step": 2550, "train/total_loss": 0.16356772184371948 }, { "entropy": 8.986995697021484, "epoch": 0.25222463911409926, "mean_token_accuracy": 0.7796609997749329, "num_tokens": 14020660.0, "step": 2551, "train/ce_loss": 1.284815788269043 }, { "epoch": 0.25222463911409926, "step": 2551, "train/sim_loss": 0.0859375 }, { "epoch": 0.25222463911409926, "step": 2551, "train/total_loss": 0.21441908180713654 }, { "entropy": 9.203989028930664, "epoch": 0.2523235119636148, "mean_token_accuracy": 0.7861482501029968, "num_tokens": 14026074.0, "step": 2552, "train/ce_loss": 0.6312835216522217 }, { "epoch": 0.2523235119636148, "step": 2552, "train/sim_loss": 0.03125 }, { "epoch": 0.2523235119636148, "step": 2552, "train/total_loss": 0.09437835216522217 }, { "entropy": 9.188783645629883, "epoch": 0.25242238481313034, "mean_token_accuracy": 0.7429620623588562, "num_tokens": 14031546.0, "step": 2553, "train/ce_loss": 0.6761230826377869 }, { "epoch": 0.25242238481313034, "step": 2553, "train/sim_loss": 0.0546875 }, { "epoch": 0.25242238481313034, "step": 2553, "train/total_loss": 0.12229981273412704 }, { "entropy": 9.352448463439941, "epoch": 0.2525212576626458, "mean_token_accuracy": 0.7630331516265869, "num_tokens": 14036884.0, "step": 2554, "train/ce_loss": 0.5977505445480347 }, { "epoch": 0.2525212576626458, "step": 2554, "train/sim_loss": 0.05078125 }, { "epoch": 0.2525212576626458, "step": 2554, "train/total_loss": 0.11055630445480347 }, { "entropy": 8.91718864440918, "epoch": 0.25262013051216137, "mean_token_accuracy": 0.7608225345611572, "num_tokens": 14042377.0, "step": 2555, "train/ce_loss": 0.5179446935653687 }, { "epoch": 0.25262013051216137, "step": 2555, "train/sim_loss": 0.03125 }, { "epoch": 0.25262013051216137, "step": 2555, "train/total_loss": 0.08304446935653687 }, { "entropy": 9.311704635620117, "epoch": 0.2527190033616769, "mean_token_accuracy": 0.7140957713127136, "num_tokens": 14047756.0, "step": 2556, "train/ce_loss": 0.8799871206283569 }, { "epoch": 0.2527190033616769, "step": 2556, "train/sim_loss": 0.14453125 }, { "epoch": 0.2527190033616769, "step": 2556, "train/total_loss": 0.23252996802330017 }, { "entropy": 9.115615844726562, "epoch": 0.2528178762111924, "mean_token_accuracy": 0.7230955362319946, "num_tokens": 14053193.0, "step": 2557, "train/ce_loss": 0.6646724343299866 }, { "epoch": 0.2528178762111924, "step": 2557, "train/sim_loss": 0.08984375 }, { "epoch": 0.2528178762111924, "step": 2557, "train/total_loss": 0.1563110053539276 }, { "entropy": 9.137027740478516, "epoch": 0.25291674906070793, "mean_token_accuracy": 0.7578579783439636, "num_tokens": 14058655.0, "step": 2558, "train/ce_loss": 0.6556375026702881 }, { "epoch": 0.25291674906070793, "step": 2558, "train/sim_loss": 0.05078125 }, { "epoch": 0.25291674906070793, "step": 2558, "train/total_loss": 0.11634500324726105 }, { "entropy": 9.288435935974121, "epoch": 0.2530156219102235, "mean_token_accuracy": 0.7220077514648438, "num_tokens": 14064030.0, "step": 2559, "train/ce_loss": 0.844292938709259 }, { "epoch": 0.2530156219102235, "step": 2559, "train/sim_loss": 0.078125 }, { "epoch": 0.2530156219102235, "step": 2559, "train/total_loss": 0.1625542938709259 }, { "epoch": 0.25311449475973896, "grad_norm": 0.9486482739448547, "learning_rate": 9.369776986599417e-06, "loss": 0.1532, "step": 2560 }, { "entropy": 8.846431732177734, "epoch": 0.25311449475973896, "mean_token_accuracy": 0.7317073345184326, "num_tokens": 14069465.0, "step": 2560, "train/ce_loss": 0.587955892086029 }, { "epoch": 0.25311449475973896, "step": 2560, "train/sim_loss": 0.1171875 }, { "epoch": 0.25311449475973896, "step": 2560, "train/total_loss": 0.17598308622837067 }, { "entropy": 9.066347122192383, "epoch": 0.2532133676092545, "mean_token_accuracy": 0.7060703039169312, "num_tokens": 14074917.0, "step": 2561, "train/ce_loss": 0.6674204468727112 }, { "epoch": 0.2532133676092545, "step": 2561, "train/sim_loss": 0.08203125 }, { "epoch": 0.2532133676092545, "step": 2561, "train/total_loss": 0.14877329766750336 }, { "entropy": 9.319612503051758, "epoch": 0.25331224045877004, "mean_token_accuracy": 0.7327249050140381, "num_tokens": 14080237.0, "step": 2562, "train/ce_loss": 1.0373141765594482 }, { "epoch": 0.25331224045877004, "step": 2562, "train/sim_loss": 0.0625 }, { "epoch": 0.25331224045877004, "step": 2562, "train/total_loss": 0.1662314236164093 }, { "entropy": 8.78535270690918, "epoch": 0.25341111330828553, "mean_token_accuracy": 0.7599225640296936, "num_tokens": 14085870.0, "step": 2563, "train/ce_loss": 0.6763502955436707 }, { "epoch": 0.25341111330828553, "step": 2563, "train/sim_loss": 0.0625 }, { "epoch": 0.25341111330828553, "step": 2563, "train/total_loss": 0.13013502955436707 }, { "entropy": 9.173588752746582, "epoch": 0.25350998615780107, "mean_token_accuracy": 0.7469135522842407, "num_tokens": 14091279.0, "step": 2564, "train/ce_loss": 0.657570481300354 }, { "epoch": 0.25350998615780107, "step": 2564, "train/sim_loss": 0.046875 }, { "epoch": 0.25350998615780107, "step": 2564, "train/total_loss": 0.11263205111026764 }, { "entropy": 9.188470840454102, "epoch": 0.2536088590073166, "mean_token_accuracy": 0.6910994648933411, "num_tokens": 14096672.0, "step": 2565, "train/ce_loss": 1.2209442853927612 }, { "epoch": 0.2536088590073166, "step": 2565, "train/sim_loss": 0.0859375 }, { "epoch": 0.2536088590073166, "step": 2565, "train/total_loss": 0.20803192257881165 }, { "entropy": 9.100025177001953, "epoch": 0.2537077318568321, "mean_token_accuracy": 0.6995412707328796, "num_tokens": 14102175.0, "step": 2566, "train/ce_loss": 1.1522414684295654 }, { "epoch": 0.2537077318568321, "step": 2566, "train/sim_loss": 0.11328125 }, { "epoch": 0.2537077318568321, "step": 2566, "train/total_loss": 0.22850540280342102 }, { "entropy": 8.742734909057617, "epoch": 0.25380660470634764, "mean_token_accuracy": 0.7040913701057434, "num_tokens": 14107797.0, "step": 2567, "train/ce_loss": 1.7403491735458374 }, { "epoch": 0.25380660470634764, "step": 2567, "train/sim_loss": 0.05859375 }, { "epoch": 0.25380660470634764, "step": 2567, "train/total_loss": 0.23262867331504822 }, { "entropy": 9.033987045288086, "epoch": 0.2539054775558632, "mean_token_accuracy": 0.7449584603309631, "num_tokens": 14113258.0, "step": 2568, "train/ce_loss": 1.4428044557571411 }, { "epoch": 0.2539054775558632, "step": 2568, "train/sim_loss": 0.12109375 }, { "epoch": 0.2539054775558632, "step": 2568, "train/total_loss": 0.26537418365478516 }, { "entropy": 8.952230453491211, "epoch": 0.25400435040537866, "mean_token_accuracy": 0.6953907608985901, "num_tokens": 14118919.0, "step": 2569, "train/ce_loss": 1.4586446285247803 }, { "epoch": 0.25400435040537866, "step": 2569, "train/sim_loss": 0.12890625 }, { "epoch": 0.25400435040537866, "step": 2569, "train/total_loss": 0.27477073669433594 }, { "entropy": 8.932051658630371, "epoch": 0.2541032232548942, "mean_token_accuracy": 0.7299794554710388, "num_tokens": 14124481.0, "step": 2570, "train/ce_loss": 0.42366865277290344 }, { "epoch": 0.2541032232548942, "step": 2570, "train/sim_loss": 0.0859375 }, { "epoch": 0.2541032232548942, "step": 2570, "train/total_loss": 0.1283043622970581 }, { "entropy": 9.08497428894043, "epoch": 0.25420209610440975, "mean_token_accuracy": 0.6903991103172302, "num_tokens": 14129999.0, "step": 2571, "train/ce_loss": 1.0506181716918945 }, { "epoch": 0.25420209610440975, "step": 2571, "train/sim_loss": 0.125 }, { "epoch": 0.25420209610440975, "step": 2571, "train/total_loss": 0.2300618290901184 }, { "entropy": 9.311030387878418, "epoch": 0.25430096895392523, "mean_token_accuracy": 0.753824770450592, "num_tokens": 14135311.0, "step": 2572, "train/ce_loss": 0.6700097322463989 }, { "epoch": 0.25430096895392523, "step": 2572, "train/sim_loss": 0.0859375 }, { "epoch": 0.25430096895392523, "step": 2572, "train/total_loss": 0.15293848514556885 }, { "entropy": 9.006789207458496, "epoch": 0.2543998418034408, "mean_token_accuracy": 0.7106842994689941, "num_tokens": 14140778.0, "step": 2573, "train/ce_loss": 0.6790409684181213 }, { "epoch": 0.2543998418034408, "step": 2573, "train/sim_loss": 0.0390625 }, { "epoch": 0.2543998418034408, "step": 2573, "train/total_loss": 0.10696659982204437 }, { "entropy": 8.984986305236816, "epoch": 0.2544987146529563, "mean_token_accuracy": 0.7022900581359863, "num_tokens": 14146281.0, "step": 2574, "train/ce_loss": 0.8778839111328125 }, { "epoch": 0.2544987146529563, "step": 2574, "train/sim_loss": 0.0546875 }, { "epoch": 0.2544987146529563, "step": 2574, "train/total_loss": 0.1424759030342102 }, { "entropy": 8.577339172363281, "epoch": 0.2545975875024718, "mean_token_accuracy": 0.7319013476371765, "num_tokens": 14152133.0, "step": 2575, "train/ce_loss": 0.2899893522262573 }, { "epoch": 0.2545975875024718, "step": 2575, "train/sim_loss": 0.0390625 }, { "epoch": 0.2545975875024718, "step": 2575, "train/total_loss": 0.06806143373250961 }, { "entropy": 8.725369453430176, "epoch": 0.25469646035198734, "mean_token_accuracy": 0.7669584155082703, "num_tokens": 14157720.0, "step": 2576, "train/ce_loss": 1.361704707145691 }, { "epoch": 0.25469646035198734, "step": 2576, "train/sim_loss": 0.0859375 }, { "epoch": 0.25469646035198734, "step": 2576, "train/total_loss": 0.22210797667503357 }, { "entropy": 9.461631774902344, "epoch": 0.2547953332015029, "mean_token_accuracy": 0.8125, "num_tokens": 14163180.0, "step": 2577, "train/ce_loss": 0.6009749174118042 }, { "epoch": 0.2547953332015029, "step": 2577, "train/sim_loss": 0.05078125 }, { "epoch": 0.2547953332015029, "step": 2577, "train/total_loss": 0.11087874323129654 }, { "entropy": 8.531537055969238, "epoch": 0.25489420605101837, "mean_token_accuracy": 0.707554817199707, "num_tokens": 14168982.0, "step": 2578, "train/ce_loss": 0.7559697031974792 }, { "epoch": 0.25489420605101837, "step": 2578, "train/sim_loss": 0.0859375 }, { "epoch": 0.25489420605101837, "step": 2578, "train/total_loss": 0.16153447329998016 }, { "entropy": 8.629059791564941, "epoch": 0.2549930789005339, "mean_token_accuracy": 0.6438468098640442, "num_tokens": 14174781.0, "step": 2579, "train/ce_loss": 1.0316078662872314 }, { "epoch": 0.2549930789005339, "step": 2579, "train/sim_loss": 0.12109375 }, { "epoch": 0.2549930789005339, "step": 2579, "train/total_loss": 0.2242545485496521 }, { "epoch": 0.25509195175004945, "grad_norm": 0.8974201083183289, "learning_rate": 9.364832121841468e-06, "loss": 0.1681, "step": 2580 }, { "entropy": 9.01436996459961, "epoch": 0.25509195175004945, "mean_token_accuracy": 0.7795566320419312, "num_tokens": 14180189.0, "step": 2580, "train/ce_loss": 0.4470849335193634 }, { "epoch": 0.25509195175004945, "step": 2580, "train/sim_loss": 0.0234375 }, { "epoch": 0.25509195175004945, "step": 2580, "train/total_loss": 0.0681459903717041 }, { "entropy": 8.901748657226562, "epoch": 0.25519082459956494, "mean_token_accuracy": 0.7663551568984985, "num_tokens": 14185677.0, "step": 2581, "train/ce_loss": 0.78059321641922 }, { "epoch": 0.25519082459956494, "step": 2581, "train/sim_loss": 0.06640625 }, { "epoch": 0.25519082459956494, "step": 2581, "train/total_loss": 0.14446556568145752 }, { "entropy": 8.956632614135742, "epoch": 0.2552896974490805, "mean_token_accuracy": 0.7136706113815308, "num_tokens": 14191254.0, "step": 2582, "train/ce_loss": 0.7201001048088074 }, { "epoch": 0.2552896974490805, "step": 2582, "train/sim_loss": 0.078125 }, { "epoch": 0.2552896974490805, "step": 2582, "train/total_loss": 0.15013501048088074 }, { "entropy": 9.203222274780273, "epoch": 0.255388570298596, "mean_token_accuracy": 0.7241867184638977, "num_tokens": 14196629.0, "step": 2583, "train/ce_loss": 0.6815394163131714 }, { "epoch": 0.255388570298596, "step": 2583, "train/sim_loss": 0.0546875 }, { "epoch": 0.255388570298596, "step": 2583, "train/total_loss": 0.12284144014120102 }, { "entropy": 9.067233085632324, "epoch": 0.2554874431481115, "mean_token_accuracy": 0.6758939027786255, "num_tokens": 14202138.0, "step": 2584, "train/ce_loss": 0.5596544146537781 }, { "epoch": 0.2554874431481115, "step": 2584, "train/sim_loss": 0.06640625 }, { "epoch": 0.2554874431481115, "step": 2584, "train/total_loss": 0.12237168848514557 }, { "entropy": 9.182413101196289, "epoch": 0.25558631599762704, "mean_token_accuracy": 0.7395683526992798, "num_tokens": 14207423.0, "step": 2585, "train/ce_loss": 0.9102421402931213 }, { "epoch": 0.25558631599762704, "step": 2585, "train/sim_loss": 0.0859375 }, { "epoch": 0.25558631599762704, "step": 2585, "train/total_loss": 0.1769617199897766 }, { "entropy": 8.659276962280273, "epoch": 0.2556851888471426, "mean_token_accuracy": 0.7088353633880615, "num_tokens": 14213045.0, "step": 2586, "train/ce_loss": 1.1355135440826416 }, { "epoch": 0.2556851888471426, "step": 2586, "train/sim_loss": 0.0703125 }, { "epoch": 0.2556851888471426, "step": 2586, "train/total_loss": 0.18386384844779968 }, { "entropy": 9.054567337036133, "epoch": 0.25578406169665807, "mean_token_accuracy": 0.7288135886192322, "num_tokens": 14218499.0, "step": 2587, "train/ce_loss": 0.5818638205528259 }, { "epoch": 0.25578406169665807, "step": 2587, "train/sim_loss": 0.06640625 }, { "epoch": 0.25578406169665807, "step": 2587, "train/total_loss": 0.12459263205528259 }, { "entropy": 8.772193908691406, "epoch": 0.2558829345461736, "mean_token_accuracy": 0.7903075218200684, "num_tokens": 14224167.0, "step": 2588, "train/ce_loss": 0.5818676948547363 }, { "epoch": 0.2558829345461736, "step": 2588, "train/sim_loss": 0.01953125 }, { "epoch": 0.2558829345461736, "step": 2588, "train/total_loss": 0.07771801948547363 }, { "entropy": 9.162617683410645, "epoch": 0.25598180739568915, "mean_token_accuracy": 0.7299528121948242, "num_tokens": 14229592.0, "step": 2589, "train/ce_loss": 0.544422447681427 }, { "epoch": 0.25598180739568915, "step": 2589, "train/sim_loss": 0.03125 }, { "epoch": 0.25598180739568915, "step": 2589, "train/total_loss": 0.08569224178791046 }, { "entropy": 8.877948760986328, "epoch": 0.25608068024520464, "mean_token_accuracy": 0.7876404523849487, "num_tokens": 14235086.0, "step": 2590, "train/ce_loss": 0.9537914991378784 }, { "epoch": 0.25608068024520464, "step": 2590, "train/sim_loss": 0.08984375 }, { "epoch": 0.25608068024520464, "step": 2590, "train/total_loss": 0.18522289395332336 }, { "entropy": 8.998027801513672, "epoch": 0.2561795530947202, "mean_token_accuracy": 0.7315436005592346, "num_tokens": 14240564.0, "step": 2591, "train/ce_loss": 0.735234797000885 }, { "epoch": 0.2561795530947202, "step": 2591, "train/sim_loss": 0.0625 }, { "epoch": 0.2561795530947202, "step": 2591, "train/total_loss": 0.13602349162101746 }, { "entropy": 8.94859790802002, "epoch": 0.2562784259442357, "mean_token_accuracy": 0.715969979763031, "num_tokens": 14246116.0, "step": 2592, "train/ce_loss": 1.0629711151123047 }, { "epoch": 0.2562784259442357, "step": 2592, "train/sim_loss": 0.09375 }, { "epoch": 0.2562784259442357, "step": 2592, "train/total_loss": 0.200047105550766 }, { "entropy": 9.006155014038086, "epoch": 0.25637729879375126, "mean_token_accuracy": 0.7503015398979187, "num_tokens": 14251552.0, "step": 2593, "train/ce_loss": 0.6583736538887024 }, { "epoch": 0.25637729879375126, "step": 2593, "train/sim_loss": 0.046875 }, { "epoch": 0.25637729879375126, "step": 2593, "train/total_loss": 0.11271236836910248 }, { "entropy": 9.428951263427734, "epoch": 0.25647617164326675, "mean_token_accuracy": 0.7549770474433899, "num_tokens": 14256820.0, "step": 2594, "train/ce_loss": 0.38979002833366394 }, { "epoch": 0.25647617164326675, "step": 2594, "train/sim_loss": 0.01953125 }, { "epoch": 0.25647617164326675, "step": 2594, "train/total_loss": 0.05851025506854057 }, { "entropy": 9.055510520935059, "epoch": 0.2565750444927823, "mean_token_accuracy": 0.744303822517395, "num_tokens": 14262201.0, "step": 2595, "train/ce_loss": 0.708832323551178 }, { "epoch": 0.2565750444927823, "step": 2595, "train/sim_loss": 0.078125 }, { "epoch": 0.2565750444927823, "step": 2595, "train/total_loss": 0.14900824427604675 }, { "entropy": 9.131780624389648, "epoch": 0.25667391734229783, "mean_token_accuracy": 0.7137930989265442, "num_tokens": 14267869.0, "step": 2596, "train/ce_loss": 1.5732818841934204 }, { "epoch": 0.25667391734229783, "step": 2596, "train/sim_loss": 0.09375 }, { "epoch": 0.25667391734229783, "step": 2596, "train/total_loss": 0.25107818841934204 }, { "entropy": 9.461389541625977, "epoch": 0.2567727901918133, "mean_token_accuracy": 0.7346625924110413, "num_tokens": 14273065.0, "step": 2597, "train/ce_loss": 0.970515787601471 }, { "epoch": 0.2567727901918133, "step": 2597, "train/sim_loss": 0.07421875 }, { "epoch": 0.2567727901918133, "step": 2597, "train/total_loss": 0.17127034068107605 }, { "entropy": 8.806466102600098, "epoch": 0.25687166304132886, "mean_token_accuracy": 0.7485448122024536, "num_tokens": 14278627.0, "step": 2598, "train/ce_loss": 0.47853296995162964 }, { "epoch": 0.25687166304132886, "step": 2598, "train/sim_loss": 0.06640625 }, { "epoch": 0.25687166304132886, "step": 2598, "train/total_loss": 0.11425954848527908 }, { "entropy": 9.043811798095703, "epoch": 0.2569705358908444, "mean_token_accuracy": 0.7928416728973389, "num_tokens": 14284134.0, "step": 2599, "train/ce_loss": 0.483083039522171 }, { "epoch": 0.2569705358908444, "step": 2599, "train/sim_loss": 0.0234375 }, { "epoch": 0.2569705358908444, "step": 2599, "train/total_loss": 0.07174580544233322 }, { "epoch": 0.2570694087403599, "grad_norm": 0.5890930891036987, "learning_rate": 9.35988725708352e-06, "loss": 0.1453, "step": 2600 }, { "entropy": 9.314231872558594, "epoch": 0.2570694087403599, "mean_token_accuracy": 0.7459239363670349, "num_tokens": 14289506.0, "step": 2600, "train/ce_loss": 0.7243672013282776 }, { "epoch": 0.2570694087403599, "step": 2600, "train/sim_loss": 0.08984375 }, { "epoch": 0.2570694087403599, "step": 2600, "train/total_loss": 0.16228047013282776 }, { "entropy": 9.560079574584961, "epoch": 0.2571682815898754, "mean_token_accuracy": 0.7838709950447083, "num_tokens": 14294680.0, "step": 2601, "train/ce_loss": 0.6310282349586487 }, { "epoch": 0.2571682815898754, "step": 2601, "train/sim_loss": 0.08984375 }, { "epoch": 0.2571682815898754, "step": 2601, "train/total_loss": 0.1529465764760971 }, { "entropy": 8.972077369689941, "epoch": 0.25726715443939097, "mean_token_accuracy": 0.7653179168701172, "num_tokens": 14300158.0, "step": 2602, "train/ce_loss": 0.699400007724762 }, { "epoch": 0.25726715443939097, "step": 2602, "train/sim_loss": 0.046875 }, { "epoch": 0.25726715443939097, "step": 2602, "train/total_loss": 0.1168150007724762 }, { "entropy": 9.001385688781738, "epoch": 0.25736602728890645, "mean_token_accuracy": 0.746666669845581, "num_tokens": 14305643.0, "step": 2603, "train/ce_loss": 0.501287043094635 }, { "epoch": 0.25736602728890645, "step": 2603, "train/sim_loss": 0.08984375 }, { "epoch": 0.25736602728890645, "step": 2603, "train/total_loss": 0.13997244834899902 }, { "entropy": 9.138795852661133, "epoch": 0.257464900138422, "mean_token_accuracy": 0.7864293456077576, "num_tokens": 14311079.0, "step": 2604, "train/ce_loss": 0.524766743183136 }, { "epoch": 0.257464900138422, "step": 2604, "train/sim_loss": 0.08984375 }, { "epoch": 0.257464900138422, "step": 2604, "train/total_loss": 0.1423204243183136 }, { "entropy": 8.146110534667969, "epoch": 0.25756377298793753, "mean_token_accuracy": 0.7345537543296814, "num_tokens": 14316919.0, "step": 2605, "train/ce_loss": 1.2498372793197632 }, { "epoch": 0.25756377298793753, "step": 2605, "train/sim_loss": 0.05859375 }, { "epoch": 0.25756377298793753, "step": 2605, "train/total_loss": 0.18357747793197632 }, { "entropy": 9.228160858154297, "epoch": 0.257662645837453, "mean_token_accuracy": 0.8196318745613098, "num_tokens": 14322379.0, "step": 2606, "train/ce_loss": 0.5155640244483948 }, { "epoch": 0.257662645837453, "step": 2606, "train/sim_loss": 0.0859375 }, { "epoch": 0.257662645837453, "step": 2606, "train/total_loss": 0.13749390840530396 }, { "entropy": 8.845966339111328, "epoch": 0.25776151868696856, "mean_token_accuracy": 0.769859790802002, "num_tokens": 14327876.0, "step": 2607, "train/ce_loss": 0.8632655739784241 }, { "epoch": 0.25776151868696856, "step": 2607, "train/sim_loss": 0.03515625 }, { "epoch": 0.25776151868696856, "step": 2607, "train/total_loss": 0.12148281186819077 }, { "entropy": 9.108524322509766, "epoch": 0.2578603915364841, "mean_token_accuracy": 0.7468499541282654, "num_tokens": 14333385.0, "step": 2608, "train/ce_loss": 0.6651149392127991 }, { "epoch": 0.2578603915364841, "step": 2608, "train/sim_loss": 0.06640625 }, { "epoch": 0.2578603915364841, "step": 2608, "train/total_loss": 0.13291774690151215 }, { "entropy": 9.396760940551758, "epoch": 0.2579592643859996, "mean_token_accuracy": 0.748344361782074, "num_tokens": 14338637.0, "step": 2609, "train/ce_loss": 0.6508448719978333 }, { "epoch": 0.2579592643859996, "step": 2609, "train/sim_loss": 0.046875 }, { "epoch": 0.2579592643859996, "step": 2609, "train/total_loss": 0.11195948719978333 }, { "entropy": 9.137845993041992, "epoch": 0.25805813723551513, "mean_token_accuracy": 0.7288776636123657, "num_tokens": 14344062.0, "step": 2610, "train/ce_loss": 1.358577013015747 }, { "epoch": 0.25805813723551513, "step": 2610, "train/sim_loss": 0.046875 }, { "epoch": 0.25805813723551513, "step": 2610, "train/total_loss": 0.1827327013015747 }, { "entropy": 9.031509399414062, "epoch": 0.25815701008503067, "mean_token_accuracy": 0.7151819467544556, "num_tokens": 14349450.0, "step": 2611, "train/ce_loss": 1.036136507987976 }, { "epoch": 0.25815701008503067, "step": 2611, "train/sim_loss": 0.0546875 }, { "epoch": 0.25815701008503067, "step": 2611, "train/total_loss": 0.15830114483833313 }, { "entropy": 9.035884857177734, "epoch": 0.25825588293454615, "mean_token_accuracy": 0.7664399147033691, "num_tokens": 14354967.0, "step": 2612, "train/ce_loss": 1.0372642278671265 }, { "epoch": 0.25825588293454615, "step": 2612, "train/sim_loss": 0.078125 }, { "epoch": 0.25825588293454615, "step": 2612, "train/total_loss": 0.18185141682624817 }, { "entropy": 8.852540969848633, "epoch": 0.2583547557840617, "mean_token_accuracy": 0.7558268308639526, "num_tokens": 14360445.0, "step": 2613, "train/ce_loss": 0.4478164315223694 }, { "epoch": 0.2583547557840617, "step": 2613, "train/sim_loss": 0.05859375 }, { "epoch": 0.2583547557840617, "step": 2613, "train/total_loss": 0.1033753901720047 }, { "entropy": 8.85840129852295, "epoch": 0.25845362863357724, "mean_token_accuracy": 0.7278350591659546, "num_tokens": 14366052.0, "step": 2614, "train/ce_loss": 0.922221839427948 }, { "epoch": 0.25845362863357724, "step": 2614, "train/sim_loss": 0.01953125 }, { "epoch": 0.25845362863357724, "step": 2614, "train/total_loss": 0.1117534339427948 }, { "entropy": 9.054003715515137, "epoch": 0.2585525014830927, "mean_token_accuracy": 0.7763975262641907, "num_tokens": 14371396.0, "step": 2615, "train/ce_loss": 0.35334473848342896 }, { "epoch": 0.2585525014830927, "step": 2615, "train/sim_loss": 0.09765625 }, { "epoch": 0.2585525014830927, "step": 2615, "train/total_loss": 0.13299071788787842 }, { "entropy": 9.1432466506958, "epoch": 0.25865137433260826, "mean_token_accuracy": 0.7674698829650879, "num_tokens": 14376764.0, "step": 2616, "train/ce_loss": 0.7744244933128357 }, { "epoch": 0.25865137433260826, "step": 2616, "train/sim_loss": 0.0546875 }, { "epoch": 0.25865137433260826, "step": 2616, "train/total_loss": 0.1321299523115158 }, { "entropy": 8.845569610595703, "epoch": 0.2587502471821238, "mean_token_accuracy": 0.752860426902771, "num_tokens": 14382427.0, "step": 2617, "train/ce_loss": 0.8329853415489197 }, { "epoch": 0.2587502471821238, "step": 2617, "train/sim_loss": 0.07421875 }, { "epoch": 0.2587502471821238, "step": 2617, "train/total_loss": 0.15751728415489197 }, { "entropy": 9.115957260131836, "epoch": 0.2588491200316393, "mean_token_accuracy": 0.7370325922966003, "num_tokens": 14387907.0, "step": 2618, "train/ce_loss": 0.6805377006530762 }, { "epoch": 0.2588491200316393, "step": 2618, "train/sim_loss": 0.0390625 }, { "epoch": 0.2588491200316393, "step": 2618, "train/total_loss": 0.10711627453565598 }, { "entropy": 9.02101993560791, "epoch": 0.25894799288115483, "mean_token_accuracy": 0.7348734736442566, "num_tokens": 14393462.0, "step": 2619, "train/ce_loss": 0.7425099611282349 }, { "epoch": 0.25894799288115483, "step": 2619, "train/sim_loss": 0.0546875 }, { "epoch": 0.25894799288115483, "step": 2619, "train/total_loss": 0.1289384961128235 }, { "epoch": 0.2590468657306704, "grad_norm": 0.8057045936584473, "learning_rate": 9.35494239232557e-06, "loss": 0.1382, "step": 2620 }, { "entropy": 8.93667984008789, "epoch": 0.2590468657306704, "mean_token_accuracy": 0.7585812211036682, "num_tokens": 14398972.0, "step": 2620, "train/ce_loss": 0.43168574571609497 }, { "epoch": 0.2590468657306704, "step": 2620, "train/sim_loss": 0.0234375 }, { "epoch": 0.2590468657306704, "step": 2620, "train/total_loss": 0.0666060745716095 }, { "entropy": 8.945545196533203, "epoch": 0.25914573858018586, "mean_token_accuracy": 0.8189845681190491, "num_tokens": 14404451.0, "step": 2621, "train/ce_loss": 0.5070872902870178 }, { "epoch": 0.25914573858018586, "step": 2621, "train/sim_loss": 0.0390625 }, { "epoch": 0.25914573858018586, "step": 2621, "train/total_loss": 0.08977122604846954 }, { "entropy": 9.064010620117188, "epoch": 0.2592446114297014, "mean_token_accuracy": 0.7494004964828491, "num_tokens": 14409910.0, "step": 2622, "train/ce_loss": 0.803912878036499 }, { "epoch": 0.2592446114297014, "step": 2622, "train/sim_loss": 0.109375 }, { "epoch": 0.2592446114297014, "step": 2622, "train/total_loss": 0.1897662878036499 }, { "entropy": 9.352091789245605, "epoch": 0.25934348427921694, "mean_token_accuracy": 0.7509627938270569, "num_tokens": 14415243.0, "step": 2623, "train/ce_loss": 0.4084916114807129 }, { "epoch": 0.25934348427921694, "step": 2623, "train/sim_loss": 0.046875 }, { "epoch": 0.25934348427921694, "step": 2623, "train/total_loss": 0.08772416412830353 }, { "entropy": 9.247802734375, "epoch": 0.2594423571287324, "mean_token_accuracy": 0.7958903908729553, "num_tokens": 14420623.0, "step": 2624, "train/ce_loss": 0.5731128454208374 }, { "epoch": 0.2594423571287324, "step": 2624, "train/sim_loss": 0.07421875 }, { "epoch": 0.2594423571287324, "step": 2624, "train/total_loss": 0.1315300315618515 }, { "entropy": 8.828897476196289, "epoch": 0.25954122997824797, "mean_token_accuracy": 0.7408638000488281, "num_tokens": 14426179.0, "step": 2625, "train/ce_loss": 0.8128421306610107 }, { "epoch": 0.25954122997824797, "step": 2625, "train/sim_loss": 0.03515625 }, { "epoch": 0.25954122997824797, "step": 2625, "train/total_loss": 0.11644046753644943 }, { "entropy": 9.077314376831055, "epoch": 0.2596401028277635, "mean_token_accuracy": 0.7114754319190979, "num_tokens": 14431744.0, "step": 2626, "train/ce_loss": 0.9831034541130066 }, { "epoch": 0.2596401028277635, "step": 2626, "train/sim_loss": 0.078125 }, { "epoch": 0.2596401028277635, "step": 2626, "train/total_loss": 0.17643535137176514 }, { "entropy": 8.887096405029297, "epoch": 0.259738975677279, "mean_token_accuracy": 0.6965944170951843, "num_tokens": 14437389.0, "step": 2627, "train/ce_loss": 0.8315684199333191 }, { "epoch": 0.259738975677279, "step": 2627, "train/sim_loss": 0.07421875 }, { "epoch": 0.259738975677279, "step": 2627, "train/total_loss": 0.15737560391426086 }, { "entropy": 8.945639610290527, "epoch": 0.25983784852679453, "mean_token_accuracy": 0.7475935816764832, "num_tokens": 14443077.0, "step": 2628, "train/ce_loss": 0.9008129239082336 }, { "epoch": 0.25983784852679453, "step": 2628, "train/sim_loss": 0.0546875 }, { "epoch": 0.25983784852679453, "step": 2628, "train/total_loss": 0.14476880431175232 }, { "entropy": 9.171993255615234, "epoch": 0.2599367213763101, "mean_token_accuracy": 0.7590643167495728, "num_tokens": 14448756.0, "step": 2629, "train/ce_loss": 0.9295548796653748 }, { "epoch": 0.2599367213763101, "step": 2629, "train/sim_loss": 0.11328125 }, { "epoch": 0.2599367213763101, "step": 2629, "train/total_loss": 0.20623674988746643 }, { "entropy": 8.81846809387207, "epoch": 0.26003559422582556, "mean_token_accuracy": 0.8081632852554321, "num_tokens": 14454356.0, "step": 2630, "train/ce_loss": 0.7341740727424622 }, { "epoch": 0.26003559422582556, "step": 2630, "train/sim_loss": 0.0390625 }, { "epoch": 0.26003559422582556, "step": 2630, "train/total_loss": 0.11247991025447845 }, { "entropy": 8.561116218566895, "epoch": 0.2601344670753411, "mean_token_accuracy": 0.6581469774246216, "num_tokens": 14460184.0, "step": 2631, "train/ce_loss": 0.7553200125694275 }, { "epoch": 0.2601344670753411, "step": 2631, "train/sim_loss": 0.078125 }, { "epoch": 0.2601344670753411, "step": 2631, "train/total_loss": 0.153657004237175 }, { "entropy": 8.949241638183594, "epoch": 0.26023333992485664, "mean_token_accuracy": 0.728723406791687, "num_tokens": 14465584.0, "step": 2632, "train/ce_loss": 0.5239099264144897 }, { "epoch": 0.26023333992485664, "step": 2632, "train/sim_loss": 0.03125 }, { "epoch": 0.26023333992485664, "step": 2632, "train/total_loss": 0.08364099264144897 }, { "entropy": 9.003632545471191, "epoch": 0.2603322127743722, "mean_token_accuracy": 0.7689805030822754, "num_tokens": 14471291.0, "step": 2633, "train/ce_loss": 0.5358834862709045 }, { "epoch": 0.2603322127743722, "step": 2633, "train/sim_loss": 0.07421875 }, { "epoch": 0.2603322127743722, "step": 2633, "train/total_loss": 0.12780709564685822 }, { "entropy": 8.554817199707031, "epoch": 0.26043108562388767, "mean_token_accuracy": 0.7088949084281921, "num_tokens": 14477070.0, "step": 2634, "train/ce_loss": 0.8221631646156311 }, { "epoch": 0.26043108562388767, "step": 2634, "train/sim_loss": 0.0703125 }, { "epoch": 0.26043108562388767, "step": 2634, "train/total_loss": 0.1525288224220276 }, { "entropy": 8.899118423461914, "epoch": 0.2605299584734032, "mean_token_accuracy": 0.7034090757369995, "num_tokens": 14482575.0, "step": 2635, "train/ce_loss": 0.9237467050552368 }, { "epoch": 0.2605299584734032, "step": 2635, "train/sim_loss": 0.14453125 }, { "epoch": 0.2605299584734032, "step": 2635, "train/total_loss": 0.23690593242645264 }, { "entropy": 9.275849342346191, "epoch": 0.26062883132291875, "mean_token_accuracy": 0.7800891399383545, "num_tokens": 14487871.0, "step": 2636, "train/ce_loss": 0.8012430667877197 }, { "epoch": 0.26062883132291875, "step": 2636, "train/sim_loss": 0.0390625 }, { "epoch": 0.26062883132291875, "step": 2636, "train/total_loss": 0.11918681114912033 }, { "entropy": 8.736604690551758, "epoch": 0.26072770417243424, "mean_token_accuracy": 0.7281106114387512, "num_tokens": 14493576.0, "step": 2637, "train/ce_loss": 0.687879204750061 }, { "epoch": 0.26072770417243424, "step": 2637, "train/sim_loss": 0.03515625 }, { "epoch": 0.26072770417243424, "step": 2637, "train/total_loss": 0.10394417494535446 }, { "entropy": 9.517419815063477, "epoch": 0.2608265770219498, "mean_token_accuracy": 0.7721139192581177, "num_tokens": 14498812.0, "step": 2638, "train/ce_loss": 0.7335588932037354 }, { "epoch": 0.2608265770219498, "step": 2638, "train/sim_loss": 0.06640625 }, { "epoch": 0.2608265770219498, "step": 2638, "train/total_loss": 0.13976213335990906 }, { "entropy": 9.130793571472168, "epoch": 0.2609254498714653, "mean_token_accuracy": 0.7268463969230652, "num_tokens": 14504286.0, "step": 2639, "train/ce_loss": 0.6219677925109863 }, { "epoch": 0.2609254498714653, "step": 2639, "train/sim_loss": 0.0859375 }, { "epoch": 0.2609254498714653, "step": 2639, "train/total_loss": 0.1481342762708664 }, { "epoch": 0.2610243227209808, "grad_norm": 1.2787595987319946, "learning_rate": 9.34999752756762e-06, "loss": 0.1536, "step": 2640 }, { "entropy": 9.048046112060547, "epoch": 0.2610243227209808, "mean_token_accuracy": 0.6944151520729065, "num_tokens": 14509824.0, "step": 2640, "train/ce_loss": 0.49508151412010193 }, { "epoch": 0.2610243227209808, "step": 2640, "train/sim_loss": 0.07421875 }, { "epoch": 0.2610243227209808, "step": 2640, "train/total_loss": 0.12372690439224243 }, { "entropy": 9.291267395019531, "epoch": 0.26112319557049635, "mean_token_accuracy": 0.7797537446022034, "num_tokens": 14515196.0, "step": 2641, "train/ce_loss": 0.8977423906326294 }, { "epoch": 0.26112319557049635, "step": 2641, "train/sim_loss": 0.0546875 }, { "epoch": 0.26112319557049635, "step": 2641, "train/total_loss": 0.1444617509841919 }, { "entropy": 9.034917831420898, "epoch": 0.2612220684200119, "mean_token_accuracy": 0.7163197994232178, "num_tokens": 14520753.0, "step": 2642, "train/ce_loss": 0.9340407252311707 }, { "epoch": 0.2612220684200119, "step": 2642, "train/sim_loss": 0.04296875 }, { "epoch": 0.2612220684200119, "step": 2642, "train/total_loss": 0.13637283444404602 }, { "entropy": 9.021181106567383, "epoch": 0.2613209412695274, "mean_token_accuracy": 0.7555282711982727, "num_tokens": 14526192.0, "step": 2643, "train/ce_loss": 0.6558517813682556 }, { "epoch": 0.2613209412695274, "step": 2643, "train/sim_loss": 0.05859375 }, { "epoch": 0.2613209412695274, "step": 2643, "train/total_loss": 0.1241789311170578 }, { "entropy": 8.638054847717285, "epoch": 0.2614198141190429, "mean_token_accuracy": 0.7758620977401733, "num_tokens": 14531908.0, "step": 2644, "train/ce_loss": 0.4471637010574341 }, { "epoch": 0.2614198141190429, "step": 2644, "train/sim_loss": 0.06640625 }, { "epoch": 0.2614198141190429, "step": 2644, "train/total_loss": 0.11112262308597565 }, { "entropy": 8.87045669555664, "epoch": 0.26151868696855846, "mean_token_accuracy": 0.7854748368263245, "num_tokens": 14537403.0, "step": 2645, "train/ce_loss": 0.6283501386642456 }, { "epoch": 0.26151868696855846, "step": 2645, "train/sim_loss": 0.06640625 }, { "epoch": 0.26151868696855846, "step": 2645, "train/total_loss": 0.12924125790596008 }, { "entropy": 9.25125503540039, "epoch": 0.26161755981807394, "mean_token_accuracy": 0.7715404629707336, "num_tokens": 14542718.0, "step": 2646, "train/ce_loss": 0.6914810538291931 }, { "epoch": 0.26161755981807394, "step": 2646, "train/sim_loss": 0.04296875 }, { "epoch": 0.26161755981807394, "step": 2646, "train/total_loss": 0.11211685836315155 }, { "entropy": 8.939061164855957, "epoch": 0.2617164326675895, "mean_token_accuracy": 0.7417218685150146, "num_tokens": 14548227.0, "step": 2647, "train/ce_loss": 0.6087618470191956 }, { "epoch": 0.2617164326675895, "step": 2647, "train/sim_loss": 0.03125 }, { "epoch": 0.2617164326675895, "step": 2647, "train/total_loss": 0.09212619066238403 }, { "entropy": 8.848587036132812, "epoch": 0.261815305517105, "mean_token_accuracy": 0.7785160541534424, "num_tokens": 14553760.0, "step": 2648, "train/ce_loss": 0.8843837380409241 }, { "epoch": 0.261815305517105, "step": 2648, "train/sim_loss": 0.125 }, { "epoch": 0.261815305517105, "step": 2648, "train/total_loss": 0.21343837678432465 }, { "entropy": 8.171056747436523, "epoch": 0.2619141783666205, "mean_token_accuracy": 0.7079240083694458, "num_tokens": 14559817.0, "step": 2649, "train/ce_loss": 1.6964038610458374 }, { "epoch": 0.2619141783666205, "step": 2649, "train/sim_loss": 0.04296875 }, { "epoch": 0.2619141783666205, "step": 2649, "train/total_loss": 0.21260914206504822 }, { "entropy": 9.178388595581055, "epoch": 0.26201305121613605, "mean_token_accuracy": 0.7162629961967468, "num_tokens": 14565327.0, "step": 2650, "train/ce_loss": 1.8717738389968872 }, { "epoch": 0.26201305121613605, "step": 2650, "train/sim_loss": 0.10546875 }, { "epoch": 0.26201305121613605, "step": 2650, "train/total_loss": 0.2926461398601532 }, { "entropy": 8.66230297088623, "epoch": 0.2621119240656516, "mean_token_accuracy": 0.6815415620803833, "num_tokens": 14570929.0, "step": 2651, "train/ce_loss": 1.680589199066162 }, { "epoch": 0.2621119240656516, "step": 2651, "train/sim_loss": 0.06640625 }, { "epoch": 0.2621119240656516, "step": 2651, "train/total_loss": 0.23446516692638397 }, { "entropy": 8.985464096069336, "epoch": 0.2622107969151671, "mean_token_accuracy": 0.7277167439460754, "num_tokens": 14576422.0, "step": 2652, "train/ce_loss": 1.0887826681137085 }, { "epoch": 0.2622107969151671, "step": 2652, "train/sim_loss": 0.08203125 }, { "epoch": 0.2622107969151671, "step": 2652, "train/total_loss": 0.1909095197916031 }, { "entropy": 8.988216400146484, "epoch": 0.2623096697646826, "mean_token_accuracy": 0.7333333492279053, "num_tokens": 14581938.0, "step": 2653, "train/ce_loss": 0.6794180870056152 }, { "epoch": 0.2623096697646826, "step": 2653, "train/sim_loss": 0.0625 }, { "epoch": 0.2623096697646826, "step": 2653, "train/total_loss": 0.130441814661026 }, { "entropy": 8.918237686157227, "epoch": 0.26240854261419816, "mean_token_accuracy": 0.7671673893928528, "num_tokens": 14587529.0, "step": 2654, "train/ce_loss": 0.5561239123344421 }, { "epoch": 0.26240854261419816, "step": 2654, "train/sim_loss": 0.08984375 }, { "epoch": 0.26240854261419816, "step": 2654, "train/total_loss": 0.14545613527297974 }, { "entropy": 8.79340934753418, "epoch": 0.26250741546371364, "mean_token_accuracy": 0.743929386138916, "num_tokens": 14593019.0, "step": 2655, "train/ce_loss": 0.5483924746513367 }, { "epoch": 0.26250741546371364, "step": 2655, "train/sim_loss": 0.06640625 }, { "epoch": 0.26250741546371364, "step": 2655, "train/total_loss": 0.12124550342559814 }, { "entropy": 8.97017765045166, "epoch": 0.2626062883132292, "mean_token_accuracy": 0.7548179626464844, "num_tokens": 14598608.0, "step": 2656, "train/ce_loss": 0.8941709399223328 }, { "epoch": 0.2626062883132292, "step": 2656, "train/sim_loss": 0.09375 }, { "epoch": 0.2626062883132292, "step": 2656, "train/total_loss": 0.18316709995269775 }, { "entropy": 9.037513732910156, "epoch": 0.2627051611627447, "mean_token_accuracy": 0.7105855941772461, "num_tokens": 14604101.0, "step": 2657, "train/ce_loss": 1.2742186784744263 }, { "epoch": 0.2627051611627447, "step": 2657, "train/sim_loss": 0.09375 }, { "epoch": 0.2627051611627447, "step": 2657, "train/total_loss": 0.22117187082767487 }, { "entropy": 9.164360046386719, "epoch": 0.2628040340122602, "mean_token_accuracy": 0.7477477192878723, "num_tokens": 14609491.0, "step": 2658, "train/ce_loss": 0.7844141125679016 }, { "epoch": 0.2628040340122602, "step": 2658, "train/sim_loss": 0.0625 }, { "epoch": 0.2628040340122602, "step": 2658, "train/total_loss": 0.14094141125679016 }, { "entropy": 9.089921951293945, "epoch": 0.26290290686177575, "mean_token_accuracy": 0.7194679379463196, "num_tokens": 14614957.0, "step": 2659, "train/ce_loss": 0.6877031922340393 }, { "epoch": 0.26290290686177575, "step": 2659, "train/sim_loss": 0.04296875 }, { "epoch": 0.26290290686177575, "step": 2659, "train/total_loss": 0.11173906922340393 }, { "epoch": 0.2630017797112913, "grad_norm": 0.784711480140686, "learning_rate": 9.345052662809673e-06, "loss": 0.144, "step": 2660 }, { "entropy": 8.978690147399902, "epoch": 0.2630017797112913, "mean_token_accuracy": 0.8069584965705872, "num_tokens": 14620481.0, "step": 2660, "train/ce_loss": 0.5908174514770508 }, { "epoch": 0.2630017797112913, "step": 2660, "train/sim_loss": 0.08984375 }, { "epoch": 0.2630017797112913, "step": 2660, "train/total_loss": 0.14892549812793732 }, { "entropy": 8.993728637695312, "epoch": 0.2631006525608068, "mean_token_accuracy": 0.6796116232872009, "num_tokens": 14625935.0, "step": 2661, "train/ce_loss": 0.6498512029647827 }, { "epoch": 0.2631006525608068, "step": 2661, "train/sim_loss": 0.08984375 }, { "epoch": 0.2631006525608068, "step": 2661, "train/total_loss": 0.15482887625694275 }, { "entropy": 8.765996932983398, "epoch": 0.2631995254103223, "mean_token_accuracy": 0.7556818127632141, "num_tokens": 14631648.0, "step": 2662, "train/ce_loss": 0.9376183748245239 }, { "epoch": 0.2631995254103223, "step": 2662, "train/sim_loss": 0.09375 }, { "epoch": 0.2631995254103223, "step": 2662, "train/total_loss": 0.18751183152198792 }, { "entropy": 8.987675666809082, "epoch": 0.26329839825983786, "mean_token_accuracy": 0.7719298005104065, "num_tokens": 14637178.0, "step": 2663, "train/ce_loss": 0.5294634699821472 }, { "epoch": 0.26329839825983786, "step": 2663, "train/sim_loss": 0.078125 }, { "epoch": 0.26329839825983786, "step": 2663, "train/total_loss": 0.13107134401798248 }, { "entropy": 9.327277183532715, "epoch": 0.26339727110935335, "mean_token_accuracy": 0.7597122192382812, "num_tokens": 14642510.0, "step": 2664, "train/ce_loss": 0.8666014671325684 }, { "epoch": 0.26339727110935335, "step": 2664, "train/sim_loss": 0.03515625 }, { "epoch": 0.26339727110935335, "step": 2664, "train/total_loss": 0.12181639671325684 }, { "entropy": 8.996562957763672, "epoch": 0.2634961439588689, "mean_token_accuracy": 0.6952264308929443, "num_tokens": 14648152.0, "step": 2665, "train/ce_loss": 0.44186878204345703 }, { "epoch": 0.2634961439588689, "step": 2665, "train/sim_loss": 0.078125 }, { "epoch": 0.2634961439588689, "step": 2665, "train/total_loss": 0.12231187522411346 }, { "entropy": 8.860501289367676, "epoch": 0.26359501680838443, "mean_token_accuracy": 0.6890052556991577, "num_tokens": 14653763.0, "step": 2666, "train/ce_loss": 1.3570289611816406 }, { "epoch": 0.26359501680838443, "step": 2666, "train/sim_loss": 0.1171875 }, { "epoch": 0.26359501680838443, "step": 2666, "train/total_loss": 0.252890408039093 }, { "entropy": 8.793676376342773, "epoch": 0.2636938896578999, "mean_token_accuracy": 0.7365438938140869, "num_tokens": 14659402.0, "step": 2667, "train/ce_loss": 0.3731904923915863 }, { "epoch": 0.2636938896578999, "step": 2667, "train/sim_loss": 0.05859375 }, { "epoch": 0.2636938896578999, "step": 2667, "train/total_loss": 0.09591279923915863 }, { "entropy": 8.987001419067383, "epoch": 0.26379276250741546, "mean_token_accuracy": 0.7397899627685547, "num_tokens": 14664875.0, "step": 2668, "train/ce_loss": 0.5416690707206726 }, { "epoch": 0.26379276250741546, "step": 2668, "train/sim_loss": 0.0859375 }, { "epoch": 0.26379276250741546, "step": 2668, "train/total_loss": 0.14010441303253174 }, { "entropy": 8.691656112670898, "epoch": 0.263891635356931, "mean_token_accuracy": 0.6838301420211792, "num_tokens": 14670580.0, "step": 2669, "train/ce_loss": 0.7663032412528992 }, { "epoch": 0.263891635356931, "step": 2669, "train/sim_loss": 0.09765625 }, { "epoch": 0.263891635356931, "step": 2669, "train/total_loss": 0.17428657412528992 }, { "entropy": 8.486310958862305, "epoch": 0.2639905082064465, "mean_token_accuracy": 0.7642928957939148, "num_tokens": 14676324.0, "step": 2670, "train/ce_loss": 0.6302361488342285 }, { "epoch": 0.2639905082064465, "step": 2670, "train/sim_loss": 0.140625 }, { "epoch": 0.2639905082064465, "step": 2670, "train/total_loss": 0.2036486268043518 }, { "entropy": 9.108469009399414, "epoch": 0.264089381055962, "mean_token_accuracy": 0.8039906024932861, "num_tokens": 14681751.0, "step": 2671, "train/ce_loss": 0.36568698287010193 }, { "epoch": 0.264089381055962, "step": 2671, "train/sim_loss": 0.01953125 }, { "epoch": 0.264089381055962, "step": 2671, "train/total_loss": 0.05609994754195213 }, { "entropy": 8.92342758178711, "epoch": 0.26418825390547757, "mean_token_accuracy": 0.7696447968482971, "num_tokens": 14687224.0, "step": 2672, "train/ce_loss": 1.306592345237732 }, { "epoch": 0.26418825390547757, "step": 2672, "train/sim_loss": 0.08203125 }, { "epoch": 0.26418825390547757, "step": 2672, "train/total_loss": 0.21269048750400543 }, { "entropy": 9.0706148147583, "epoch": 0.26428712675499305, "mean_token_accuracy": 0.7006711363792419, "num_tokens": 14692619.0, "step": 2673, "train/ce_loss": 1.3585960865020752 }, { "epoch": 0.26428712675499305, "step": 2673, "train/sim_loss": 0.125 }, { "epoch": 0.26428712675499305, "step": 2673, "train/total_loss": 0.2608596086502075 }, { "entropy": 8.940422058105469, "epoch": 0.2643859996045086, "mean_token_accuracy": 0.745976984500885, "num_tokens": 14698087.0, "step": 2674, "train/ce_loss": 0.4495199918746948 }, { "epoch": 0.2643859996045086, "step": 2674, "train/sim_loss": 0.0390625 }, { "epoch": 0.2643859996045086, "step": 2674, "train/total_loss": 0.08401450514793396 }, { "entropy": 9.30974006652832, "epoch": 0.26448487245402413, "mean_token_accuracy": 0.7112069129943848, "num_tokens": 14703315.0, "step": 2675, "train/ce_loss": 0.8225477337837219 }, { "epoch": 0.26448487245402413, "step": 2675, "train/sim_loss": 0.0625 }, { "epoch": 0.26448487245402413, "step": 2675, "train/total_loss": 0.14475476741790771 }, { "entropy": 8.803112983703613, "epoch": 0.2645837453035397, "mean_token_accuracy": 0.7549530863761902, "num_tokens": 14708938.0, "step": 2676, "train/ce_loss": 1.3184900283813477 }, { "epoch": 0.2645837453035397, "step": 2676, "train/sim_loss": 0.12109375 }, { "epoch": 0.2645837453035397, "step": 2676, "train/total_loss": 0.2529427409172058 }, { "entropy": 9.118158340454102, "epoch": 0.26468261815305516, "mean_token_accuracy": 0.7345013618469238, "num_tokens": 14714328.0, "step": 2677, "train/ce_loss": 0.8359856009483337 }, { "epoch": 0.26468261815305516, "step": 2677, "train/sim_loss": 0.078125 }, { "epoch": 0.26468261815305516, "step": 2677, "train/total_loss": 0.1617235541343689 }, { "entropy": 8.906831741333008, "epoch": 0.2647814910025707, "mean_token_accuracy": 0.7752928733825684, "num_tokens": 14719910.0, "step": 2678, "train/ce_loss": 0.37537187337875366 }, { "epoch": 0.2647814910025707, "step": 2678, "train/sim_loss": 0.07421875 }, { "epoch": 0.2647814910025707, "step": 2678, "train/total_loss": 0.11175593733787537 }, { "entropy": 8.67770767211914, "epoch": 0.26488036385208624, "mean_token_accuracy": 0.7775847315788269, "num_tokens": 14725708.0, "step": 2679, "train/ce_loss": 0.6100906729698181 }, { "epoch": 0.26488036385208624, "step": 2679, "train/sim_loss": 0.04296875 }, { "epoch": 0.26488036385208624, "step": 2679, "train/total_loss": 0.10397781431674957 }, { "epoch": 0.26497923670160173, "grad_norm": 0.7383280992507935, "learning_rate": 9.340107798051723e-06, "loss": 0.1504, "step": 2680 }, { "entropy": 9.050519943237305, "epoch": 0.26497923670160173, "mean_token_accuracy": 0.7200000286102295, "num_tokens": 14731274.0, "step": 2680, "train/ce_loss": 0.534028172492981 }, { "epoch": 0.26497923670160173, "step": 2680, "train/sim_loss": 0.06640625 }, { "epoch": 0.26497923670160173, "step": 2680, "train/total_loss": 0.11980906873941422 }, { "entropy": 9.06901741027832, "epoch": 0.26507810955111727, "mean_token_accuracy": 0.7760663628578186, "num_tokens": 14736699.0, "step": 2681, "train/ce_loss": 0.6287824511528015 }, { "epoch": 0.26507810955111727, "step": 2681, "train/sim_loss": 0.05078125 }, { "epoch": 0.26507810955111727, "step": 2681, "train/total_loss": 0.11365949362516403 }, { "entropy": 8.911203384399414, "epoch": 0.2651769824006328, "mean_token_accuracy": 0.7098121047019958, "num_tokens": 14742464.0, "step": 2682, "train/ce_loss": 1.2989046573638916 }, { "epoch": 0.2651769824006328, "step": 2682, "train/sim_loss": 0.09375 }, { "epoch": 0.2651769824006328, "step": 2682, "train/total_loss": 0.22364047169685364 }, { "entropy": 8.977703094482422, "epoch": 0.2652758552501483, "mean_token_accuracy": 0.7216748595237732, "num_tokens": 14747899.0, "step": 2683, "train/ce_loss": 0.8302895426750183 }, { "epoch": 0.2652758552501483, "step": 2683, "train/sim_loss": 0.0625 }, { "epoch": 0.2652758552501483, "step": 2683, "train/total_loss": 0.14552895724773407 }, { "entropy": 9.069273948669434, "epoch": 0.26537472809966384, "mean_token_accuracy": 0.679347813129425, "num_tokens": 14753439.0, "step": 2684, "train/ce_loss": 0.8567643165588379 }, { "epoch": 0.26537472809966384, "step": 2684, "train/sim_loss": 0.0546875 }, { "epoch": 0.26537472809966384, "step": 2684, "train/total_loss": 0.1403639316558838 }, { "entropy": 8.86003303527832, "epoch": 0.2654736009491794, "mean_token_accuracy": 0.7538610100746155, "num_tokens": 14759060.0, "step": 2685, "train/ce_loss": 0.6752564907073975 }, { "epoch": 0.2654736009491794, "step": 2685, "train/sim_loss": 0.01953125 }, { "epoch": 0.2654736009491794, "step": 2685, "train/total_loss": 0.08705689758062363 }, { "entropy": 8.579933166503906, "epoch": 0.26557247379869486, "mean_token_accuracy": 0.7341772317886353, "num_tokens": 14764785.0, "step": 2686, "train/ce_loss": 0.46655747294425964 }, { "epoch": 0.26557247379869486, "step": 2686, "train/sim_loss": 0.015625 }, { "epoch": 0.26557247379869486, "step": 2686, "train/total_loss": 0.062280748039484024 }, { "entropy": 9.11992073059082, "epoch": 0.2656713466482104, "mean_token_accuracy": 0.7628361582756042, "num_tokens": 14770230.0, "step": 2687, "train/ce_loss": 0.5032975077629089 }, { "epoch": 0.2656713466482104, "step": 2687, "train/sim_loss": 0.05078125 }, { "epoch": 0.2656713466482104, "step": 2687, "train/total_loss": 0.10111100226640701 }, { "entropy": 9.169716835021973, "epoch": 0.26577021949772595, "mean_token_accuracy": 0.7226994037628174, "num_tokens": 14775662.0, "step": 2688, "train/ce_loss": 0.5676220655441284 }, { "epoch": 0.26577021949772595, "step": 2688, "train/sim_loss": 0.0234375 }, { "epoch": 0.26577021949772595, "step": 2688, "train/total_loss": 0.0801997035741806 }, { "entropy": 8.658411026000977, "epoch": 0.26586909234724143, "mean_token_accuracy": 0.6962264180183411, "num_tokens": 14781266.0, "step": 2689, "train/ce_loss": 0.5439620018005371 }, { "epoch": 0.26586909234724143, "step": 2689, "train/sim_loss": 0.0546875 }, { "epoch": 0.26586909234724143, "step": 2689, "train/total_loss": 0.10908369719982147 }, { "entropy": 8.794584274291992, "epoch": 0.265967965196757, "mean_token_accuracy": 0.7287644743919373, "num_tokens": 14786916.0, "step": 2690, "train/ce_loss": 1.5545594692230225 }, { "epoch": 0.265967965196757, "step": 2690, "train/sim_loss": 0.0390625 }, { "epoch": 0.265967965196757, "step": 2690, "train/total_loss": 0.19451844692230225 }, { "entropy": 8.924946784973145, "epoch": 0.2660668380462725, "mean_token_accuracy": 0.7931416034698486, "num_tokens": 14792443.0, "step": 2691, "train/ce_loss": 0.6684203147888184 }, { "epoch": 0.2660668380462725, "step": 2691, "train/sim_loss": 0.109375 }, { "epoch": 0.2660668380462725, "step": 2691, "train/total_loss": 0.17621703445911407 }, { "entropy": 8.95544147491455, "epoch": 0.266165710895788, "mean_token_accuracy": 0.6987951993942261, "num_tokens": 14797777.0, "step": 2692, "train/ce_loss": 0.5514764785766602 }, { "epoch": 0.266165710895788, "step": 2692, "train/sim_loss": 0.04296875 }, { "epoch": 0.266165710895788, "step": 2692, "train/total_loss": 0.09811639785766602 }, { "entropy": 8.600324630737305, "epoch": 0.26626458374530354, "mean_token_accuracy": 0.7394540905952454, "num_tokens": 14803562.0, "step": 2693, "train/ce_loss": 1.3820140361785889 }, { "epoch": 0.26626458374530354, "step": 2693, "train/sim_loss": 0.046875 }, { "epoch": 0.26626458374530354, "step": 2693, "train/total_loss": 0.18507640063762665 }, { "entropy": 9.299391746520996, "epoch": 0.2663634565948191, "mean_token_accuracy": 0.7120668888092041, "num_tokens": 14809007.0, "step": 2694, "train/ce_loss": 0.3392462432384491 }, { "epoch": 0.2663634565948191, "step": 2694, "train/sim_loss": 0.078125 }, { "epoch": 0.2663634565948191, "step": 2694, "train/total_loss": 0.11204962432384491 }, { "entropy": 8.796304702758789, "epoch": 0.26646232944433457, "mean_token_accuracy": 0.7497392892837524, "num_tokens": 14814646.0, "step": 2695, "train/ce_loss": 0.9220964908599854 }, { "epoch": 0.26646232944433457, "step": 2695, "train/sim_loss": 0.0703125 }, { "epoch": 0.26646232944433457, "step": 2695, "train/total_loss": 0.16252215206623077 }, { "entropy": 8.8816556930542, "epoch": 0.2665612022938501, "mean_token_accuracy": 0.769729733467102, "num_tokens": 14820127.0, "step": 2696, "train/ce_loss": 0.35971733927726746 }, { "epoch": 0.2665612022938501, "step": 2696, "train/sim_loss": 0.046875 }, { "epoch": 0.2665612022938501, "step": 2696, "train/total_loss": 0.0828467309474945 }, { "entropy": 9.200408935546875, "epoch": 0.26666007514336565, "mean_token_accuracy": 0.7217165231704712, "num_tokens": 14825508.0, "step": 2697, "train/ce_loss": 0.6260414719581604 }, { "epoch": 0.26666007514336565, "step": 2697, "train/sim_loss": 0.02734375 }, { "epoch": 0.26666007514336565, "step": 2697, "train/total_loss": 0.0899479016661644 }, { "entropy": 9.054214477539062, "epoch": 0.26675894799288113, "mean_token_accuracy": 0.7565714120864868, "num_tokens": 14831025.0, "step": 2698, "train/ce_loss": 1.1504110097885132 }, { "epoch": 0.26675894799288113, "step": 2698, "train/sim_loss": 0.1171875 }, { "epoch": 0.26675894799288113, "step": 2698, "train/total_loss": 0.2322286069393158 }, { "entropy": 8.845863342285156, "epoch": 0.2668578208423967, "mean_token_accuracy": 0.7335216403007507, "num_tokens": 14836649.0, "step": 2699, "train/ce_loss": 1.0104111433029175 }, { "epoch": 0.2668578208423967, "step": 2699, "train/sim_loss": 0.0390625 }, { "epoch": 0.2668578208423967, "step": 2699, "train/total_loss": 0.14010360836982727 }, { "epoch": 0.2669566936919122, "grad_norm": 0.6560090184211731, "learning_rate": 9.335162933293776e-06, "loss": 0.1534, "step": 2700 }, { "entropy": 9.051287651062012, "epoch": 0.2669566936919122, "mean_token_accuracy": 0.7271573543548584, "num_tokens": 14842073.0, "step": 2700, "train/ce_loss": 0.4390254616737366 }, { "epoch": 0.2669566936919122, "step": 2700, "train/sim_loss": 0.046875 }, { "epoch": 0.2669566936919122, "step": 2700, "train/total_loss": 0.09077754616737366 }, { "entropy": 9.133369445800781, "epoch": 0.2670555665414277, "mean_token_accuracy": 0.7234042286872864, "num_tokens": 14847399.0, "step": 2701, "train/ce_loss": 1.0351208448410034 }, { "epoch": 0.2670555665414277, "step": 2701, "train/sim_loss": 0.0625 }, { "epoch": 0.2670555665414277, "step": 2701, "train/total_loss": 0.16601207852363586 }, { "entropy": 9.219060897827148, "epoch": 0.26715443939094324, "mean_token_accuracy": 0.829959511756897, "num_tokens": 14852778.0, "step": 2702, "train/ce_loss": 0.5218618512153625 }, { "epoch": 0.26715443939094324, "step": 2702, "train/sim_loss": 0.08203125 }, { "epoch": 0.26715443939094324, "step": 2702, "train/total_loss": 0.13421744108200073 }, { "entropy": 8.842876434326172, "epoch": 0.2672533122404588, "mean_token_accuracy": 0.6911764740943909, "num_tokens": 14858320.0, "step": 2703, "train/ce_loss": 0.8240863084793091 }, { "epoch": 0.2672533122404588, "step": 2703, "train/sim_loss": 0.04296875 }, { "epoch": 0.2672533122404588, "step": 2703, "train/total_loss": 0.12537738680839539 }, { "entropy": 8.976465225219727, "epoch": 0.26735218508997427, "mean_token_accuracy": 0.7563804984092712, "num_tokens": 14863838.0, "step": 2704, "train/ce_loss": 0.80218106508255 }, { "epoch": 0.26735218508997427, "step": 2704, "train/sim_loss": 0.1171875 }, { "epoch": 0.26735218508997427, "step": 2704, "train/total_loss": 0.197405606508255 }, { "entropy": 8.803754806518555, "epoch": 0.2674510579394898, "mean_token_accuracy": 0.7242646813392639, "num_tokens": 14869371.0, "step": 2705, "train/ce_loss": 0.6698227524757385 }, { "epoch": 0.2674510579394898, "step": 2705, "train/sim_loss": 0.0859375 }, { "epoch": 0.2674510579394898, "step": 2705, "train/total_loss": 0.15291976928710938 }, { "entropy": 8.938246726989746, "epoch": 0.26754993078900535, "mean_token_accuracy": 0.7036247253417969, "num_tokens": 14874988.0, "step": 2706, "train/ce_loss": 1.1774961948394775 }, { "epoch": 0.26754993078900535, "step": 2706, "train/sim_loss": 0.06640625 }, { "epoch": 0.26754993078900535, "step": 2706, "train/total_loss": 0.1841558814048767 }, { "entropy": 8.951648712158203, "epoch": 0.26764880363852084, "mean_token_accuracy": 0.7246376872062683, "num_tokens": 14880457.0, "step": 2707, "train/ce_loss": 1.0399614572525024 }, { "epoch": 0.26764880363852084, "step": 2707, "train/sim_loss": 0.10546875 }, { "epoch": 0.26764880363852084, "step": 2707, "train/total_loss": 0.2094649076461792 }, { "entropy": 8.659244537353516, "epoch": 0.2677476764880364, "mean_token_accuracy": 0.7544517517089844, "num_tokens": 14886158.0, "step": 2708, "train/ce_loss": 0.6358906626701355 }, { "epoch": 0.2677476764880364, "step": 2708, "train/sim_loss": 0.0234375 }, { "epoch": 0.2677476764880364, "step": 2708, "train/total_loss": 0.08702656626701355 }, { "entropy": 8.61720085144043, "epoch": 0.2678465493375519, "mean_token_accuracy": 0.7120938897132874, "num_tokens": 14891889.0, "step": 2709, "train/ce_loss": 0.8955880999565125 }, { "epoch": 0.2678465493375519, "step": 2709, "train/sim_loss": 0.08984375 }, { "epoch": 0.2678465493375519, "step": 2709, "train/total_loss": 0.17940255999565125 }, { "entropy": 9.279003143310547, "epoch": 0.2679454221870674, "mean_token_accuracy": 0.7877013087272644, "num_tokens": 14897162.0, "step": 2710, "train/ce_loss": 0.48402515053749084 }, { "epoch": 0.2679454221870674, "step": 2710, "train/sim_loss": 0.02734375 }, { "epoch": 0.2679454221870674, "step": 2710, "train/total_loss": 0.07574626803398132 }, { "entropy": 8.872784614562988, "epoch": 0.26804429503658295, "mean_token_accuracy": 0.7344110608100891, "num_tokens": 14902682.0, "step": 2711, "train/ce_loss": 0.6896637082099915 }, { "epoch": 0.26804429503658295, "step": 2711, "train/sim_loss": 0.0703125 }, { "epoch": 0.26804429503658295, "step": 2711, "train/total_loss": 0.13927887380123138 }, { "entropy": 9.201529502868652, "epoch": 0.2681431678860985, "mean_token_accuracy": 0.7699724435806274, "num_tokens": 14908000.0, "step": 2712, "train/ce_loss": 0.579657793045044 }, { "epoch": 0.2681431678860985, "step": 2712, "train/sim_loss": 0.05859375 }, { "epoch": 0.2681431678860985, "step": 2712, "train/total_loss": 0.11655953526496887 }, { "entropy": 8.752439498901367, "epoch": 0.268242040735614, "mean_token_accuracy": 0.7400932312011719, "num_tokens": 14913482.0, "step": 2713, "train/ce_loss": 0.4654117822647095 }, { "epoch": 0.268242040735614, "step": 2713, "train/sim_loss": 0.02734375 }, { "epoch": 0.268242040735614, "step": 2713, "train/total_loss": 0.07388493418693542 }, { "entropy": 8.865167617797852, "epoch": 0.2683409135851295, "mean_token_accuracy": 0.774678111076355, "num_tokens": 14918996.0, "step": 2714, "train/ce_loss": 1.167525053024292 }, { "epoch": 0.2683409135851295, "step": 2714, "train/sim_loss": 0.05078125 }, { "epoch": 0.2683409135851295, "step": 2714, "train/total_loss": 0.1675337553024292 }, { "entropy": 8.691970825195312, "epoch": 0.26843978643464506, "mean_token_accuracy": 0.7458292245864868, "num_tokens": 14924655.0, "step": 2715, "train/ce_loss": 0.7261674404144287 }, { "epoch": 0.26843978643464506, "step": 2715, "train/sim_loss": 0.03515625 }, { "epoch": 0.26843978643464506, "step": 2715, "train/total_loss": 0.10777299851179123 }, { "entropy": 9.03242015838623, "epoch": 0.2685386592841606, "mean_token_accuracy": 0.7351874113082886, "num_tokens": 14930102.0, "step": 2716, "train/ce_loss": 0.4655354917049408 }, { "epoch": 0.2685386592841606, "step": 2716, "train/sim_loss": 0.078125 }, { "epoch": 0.2685386592841606, "step": 2716, "train/total_loss": 0.12467855215072632 }, { "entropy": 8.719528198242188, "epoch": 0.2686375321336761, "mean_token_accuracy": 0.78922039270401, "num_tokens": 14935748.0, "step": 2717, "train/ce_loss": 0.5386845469474792 }, { "epoch": 0.2686375321336761, "step": 2717, "train/sim_loss": 0.01171875 }, { "epoch": 0.2686375321336761, "step": 2717, "train/total_loss": 0.06558720767498016 }, { "entropy": 8.651636123657227, "epoch": 0.2687364049831916, "mean_token_accuracy": 0.7008628845214844, "num_tokens": 14941372.0, "step": 2718, "train/ce_loss": 0.6573624610900879 }, { "epoch": 0.2687364049831916, "step": 2718, "train/sim_loss": 0.05859375 }, { "epoch": 0.2687364049831916, "step": 2718, "train/total_loss": 0.12432999908924103 }, { "entropy": 8.971601486206055, "epoch": 0.26883527783270716, "mean_token_accuracy": 0.6649076342582703, "num_tokens": 14946794.0, "step": 2719, "train/ce_loss": 0.6810231804847717 }, { "epoch": 0.26883527783270716, "step": 2719, "train/sim_loss": 0.05078125 }, { "epoch": 0.26883527783270716, "step": 2719, "train/total_loss": 0.11888357251882553 }, { "epoch": 0.26893415068222265, "grad_norm": 1.079055905342102, "learning_rate": 9.330218068535826e-06, "loss": 0.1481, "step": 2720 }, { "entropy": 8.87565803527832, "epoch": 0.26893415068222265, "mean_token_accuracy": 0.7371244430541992, "num_tokens": 14952305.0, "step": 2720, "train/ce_loss": 0.4598577320575714 }, { "epoch": 0.26893415068222265, "step": 2720, "train/sim_loss": 0.0390625 }, { "epoch": 0.26893415068222265, "step": 2720, "train/total_loss": 0.08504827320575714 }, { "entropy": 8.504154205322266, "epoch": 0.2690330235317382, "mean_token_accuracy": 0.7707070708274841, "num_tokens": 14957956.0, "step": 2721, "train/ce_loss": 0.762738823890686 }, { "epoch": 0.2690330235317382, "step": 2721, "train/sim_loss": 0.08984375 }, { "epoch": 0.2690330235317382, "step": 2721, "train/total_loss": 0.16611763834953308 }, { "entropy": 9.078775405883789, "epoch": 0.26913189638125373, "mean_token_accuracy": 0.7214885950088501, "num_tokens": 14963463.0, "step": 2722, "train/ce_loss": 0.32277819514274597 }, { "epoch": 0.26913189638125373, "step": 2722, "train/sim_loss": 0.05859375 }, { "epoch": 0.26913189638125373, "step": 2722, "train/total_loss": 0.09087157249450684 }, { "entropy": 8.88409423828125, "epoch": 0.2692307692307692, "mean_token_accuracy": 0.748083233833313, "num_tokens": 14968963.0, "step": 2723, "train/ce_loss": 1.2103931903839111 }, { "epoch": 0.2692307692307692, "step": 2723, "train/sim_loss": 0.04296875 }, { "epoch": 0.2692307692307692, "step": 2723, "train/total_loss": 0.16400808095932007 }, { "entropy": 9.039630889892578, "epoch": 0.26932964208028476, "mean_token_accuracy": 0.7546239495277405, "num_tokens": 14974620.0, "step": 2724, "train/ce_loss": 0.6636039614677429 }, { "epoch": 0.26932964208028476, "step": 2724, "train/sim_loss": 0.06640625 }, { "epoch": 0.26932964208028476, "step": 2724, "train/total_loss": 0.13276664912700653 }, { "entropy": 9.012276649475098, "epoch": 0.2694285149298003, "mean_token_accuracy": 0.7952380776405334, "num_tokens": 14980056.0, "step": 2725, "train/ce_loss": 0.5173184275627136 }, { "epoch": 0.2694285149298003, "step": 2725, "train/sim_loss": 0.0703125 }, { "epoch": 0.2694285149298003, "step": 2725, "train/total_loss": 0.12204433977603912 }, { "entropy": 9.227531433105469, "epoch": 0.2695273877793158, "mean_token_accuracy": 0.7395833134651184, "num_tokens": 14985435.0, "step": 2726, "train/ce_loss": 0.6398794054985046 }, { "epoch": 0.2695273877793158, "step": 2726, "train/sim_loss": 0.06640625 }, { "epoch": 0.2695273877793158, "step": 2726, "train/total_loss": 0.13039419054985046 }, { "entropy": 8.933124542236328, "epoch": 0.2696262606288313, "mean_token_accuracy": 0.7510729432106018, "num_tokens": 14991038.0, "step": 2727, "train/ce_loss": 0.6987532377243042 }, { "epoch": 0.2696262606288313, "step": 2727, "train/sim_loss": 0.0703125 }, { "epoch": 0.2696262606288313, "step": 2727, "train/total_loss": 0.1401878297328949 }, { "entropy": 9.036188125610352, "epoch": 0.26972513347834687, "mean_token_accuracy": 0.7734994292259216, "num_tokens": 14996505.0, "step": 2728, "train/ce_loss": 0.5172072649002075 }, { "epoch": 0.26972513347834687, "step": 2728, "train/sim_loss": 0.02734375 }, { "epoch": 0.26972513347834687, "step": 2728, "train/total_loss": 0.07906447350978851 }, { "entropy": 8.856854438781738, "epoch": 0.26982400632786235, "mean_token_accuracy": 0.7321652173995972, "num_tokens": 15001964.0, "step": 2729, "train/ce_loss": 0.5854946970939636 }, { "epoch": 0.26982400632786235, "step": 2729, "train/sim_loss": 0.078125 }, { "epoch": 0.26982400632786235, "step": 2729, "train/total_loss": 0.13667446374893188 }, { "entropy": 8.942659378051758, "epoch": 0.2699228791773779, "mean_token_accuracy": 0.7486398220062256, "num_tokens": 15007446.0, "step": 2730, "train/ce_loss": 0.5912684798240662 }, { "epoch": 0.2699228791773779, "step": 2730, "train/sim_loss": 0.08984375 }, { "epoch": 0.2699228791773779, "step": 2730, "train/total_loss": 0.1489706039428711 }, { "entropy": 8.917900085449219, "epoch": 0.27002175202689344, "mean_token_accuracy": 0.7491165995597839, "num_tokens": 15013100.0, "step": 2731, "train/ce_loss": 0.6418513655662537 }, { "epoch": 0.27002175202689344, "step": 2731, "train/sim_loss": 0.0234375 }, { "epoch": 0.27002175202689344, "step": 2731, "train/total_loss": 0.08762263506650925 }, { "entropy": 8.988201141357422, "epoch": 0.2701206248764089, "mean_token_accuracy": 0.752525269985199, "num_tokens": 15018598.0, "step": 2732, "train/ce_loss": 0.6657642126083374 }, { "epoch": 0.2701206248764089, "step": 2732, "train/sim_loss": 0.08203125 }, { "epoch": 0.2701206248764089, "step": 2732, "train/total_loss": 0.14860767126083374 }, { "entropy": 9.203808784484863, "epoch": 0.27021949772592446, "mean_token_accuracy": 0.7160161733627319, "num_tokens": 15023920.0, "step": 2733, "train/ce_loss": 1.135412573814392 }, { "epoch": 0.27021949772592446, "step": 2733, "train/sim_loss": 0.09375 }, { "epoch": 0.27021949772592446, "step": 2733, "train/total_loss": 0.20729126036167145 }, { "entropy": 8.908343315124512, "epoch": 0.27031837057544, "mean_token_accuracy": 0.7526132464408875, "num_tokens": 15029456.0, "step": 2734, "train/ce_loss": 1.2018240690231323 }, { "epoch": 0.27031837057544, "step": 2734, "train/sim_loss": 0.04296875 }, { "epoch": 0.27031837057544, "step": 2734, "train/total_loss": 0.16315115988254547 }, { "entropy": 9.14079761505127, "epoch": 0.2704172434249555, "mean_token_accuracy": 0.7528517246246338, "num_tokens": 15035065.0, "step": 2735, "train/ce_loss": 0.6982367038726807 }, { "epoch": 0.2704172434249555, "step": 2735, "train/sim_loss": 0.05078125 }, { "epoch": 0.2704172434249555, "step": 2735, "train/total_loss": 0.12060492485761642 }, { "entropy": 9.20100212097168, "epoch": 0.27051611627447103, "mean_token_accuracy": 0.7219387888908386, "num_tokens": 15040449.0, "step": 2736, "train/ce_loss": 0.6586173176765442 }, { "epoch": 0.27051611627447103, "step": 2736, "train/sim_loss": 0.05078125 }, { "epoch": 0.27051611627447103, "step": 2736, "train/total_loss": 0.11664298176765442 }, { "entropy": 8.749832153320312, "epoch": 0.27061498912398657, "mean_token_accuracy": 0.7700421810150146, "num_tokens": 15046045.0, "step": 2737, "train/ce_loss": 0.717591404914856 }, { "epoch": 0.27061498912398657, "step": 2737, "train/sim_loss": 0.0703125 }, { "epoch": 0.27061498912398657, "step": 2737, "train/total_loss": 0.14207163453102112 }, { "entropy": 8.37519645690918, "epoch": 0.27071386197350206, "mean_token_accuracy": 0.6770833134651184, "num_tokens": 15051825.0, "step": 2738, "train/ce_loss": 0.9609323143959045 }, { "epoch": 0.27071386197350206, "step": 2738, "train/sim_loss": 0.26171875 }, { "epoch": 0.27071386197350206, "step": 2738, "train/total_loss": 0.35781198740005493 }, { "entropy": 9.138771057128906, "epoch": 0.2708127348230176, "mean_token_accuracy": 0.767002522945404, "num_tokens": 15057244.0, "step": 2739, "train/ce_loss": 0.3785850405693054 }, { "epoch": 0.2708127348230176, "step": 2739, "train/sim_loss": 0.0625 }, { "epoch": 0.2708127348230176, "step": 2739, "train/total_loss": 0.1003585010766983 }, { "epoch": 0.27091160767253314, "grad_norm": 0.8231661319732666, "learning_rate": 9.325273203777877e-06, "loss": 0.1489, "step": 2740 }, { "entropy": 9.35253620147705, "epoch": 0.27091160767253314, "mean_token_accuracy": 0.6990553140640259, "num_tokens": 15062660.0, "step": 2740, "train/ce_loss": 0.6039444208145142 }, { "epoch": 0.27091160767253314, "step": 2740, "train/sim_loss": 0.10546875 }, { "epoch": 0.27091160767253314, "step": 2740, "train/total_loss": 0.16586318612098694 }, { "entropy": 9.080060958862305, "epoch": 0.2710104805220486, "mean_token_accuracy": 0.781883180141449, "num_tokens": 15068122.0, "step": 2741, "train/ce_loss": 0.7583499550819397 }, { "epoch": 0.2710104805220486, "step": 2741, "train/sim_loss": 0.06640625 }, { "epoch": 0.2710104805220486, "step": 2741, "train/total_loss": 0.1422412395477295 }, { "entropy": 8.972156524658203, "epoch": 0.27110935337156417, "mean_token_accuracy": 0.8104925155639648, "num_tokens": 15073603.0, "step": 2742, "train/ce_loss": 0.5797200202941895 }, { "epoch": 0.27110935337156417, "step": 2742, "train/sim_loss": 0.0234375 }, { "epoch": 0.27110935337156417, "step": 2742, "train/total_loss": 0.0814094990491867 }, { "entropy": 8.869386672973633, "epoch": 0.2712082262210797, "mean_token_accuracy": 0.7261083722114563, "num_tokens": 15079321.0, "step": 2743, "train/ce_loss": 0.889859676361084 }, { "epoch": 0.2712082262210797, "step": 2743, "train/sim_loss": 0.1015625 }, { "epoch": 0.2712082262210797, "step": 2743, "train/total_loss": 0.19054847955703735 }, { "entropy": 8.967570304870605, "epoch": 0.2713070990705952, "mean_token_accuracy": 0.7068771123886108, "num_tokens": 15084797.0, "step": 2744, "train/ce_loss": 0.7855383157730103 }, { "epoch": 0.2713070990705952, "step": 2744, "train/sim_loss": 0.09765625 }, { "epoch": 0.2713070990705952, "step": 2744, "train/total_loss": 0.17621007561683655 }, { "entropy": 9.16016960144043, "epoch": 0.27140597192011073, "mean_token_accuracy": 0.745972752571106, "num_tokens": 15090257.0, "step": 2745, "train/ce_loss": 0.6962769627571106 }, { "epoch": 0.27140597192011073, "step": 2745, "train/sim_loss": 0.12109375 }, { "epoch": 0.27140597192011073, "step": 2745, "train/total_loss": 0.19072145223617554 }, { "entropy": 8.618914604187012, "epoch": 0.2715048447696263, "mean_token_accuracy": 0.7141444087028503, "num_tokens": 15095882.0, "step": 2746, "train/ce_loss": 0.8108847141265869 }, { "epoch": 0.2715048447696263, "step": 2746, "train/sim_loss": 0.04296875 }, { "epoch": 0.2715048447696263, "step": 2746, "train/total_loss": 0.12405722588300705 }, { "entropy": 8.869425773620605, "epoch": 0.27160371761914176, "mean_token_accuracy": 0.7508161067962646, "num_tokens": 15101307.0, "step": 2747, "train/ce_loss": 0.801207959651947 }, { "epoch": 0.27160371761914176, "step": 2747, "train/sim_loss": 0.046875 }, { "epoch": 0.27160371761914176, "step": 2747, "train/total_loss": 0.12699580192565918 }, { "entropy": 8.908367156982422, "epoch": 0.2717025904686573, "mean_token_accuracy": 0.7318918704986572, "num_tokens": 15106856.0, "step": 2748, "train/ce_loss": 0.9877061247825623 }, { "epoch": 0.2717025904686573, "step": 2748, "train/sim_loss": 0.078125 }, { "epoch": 0.2717025904686573, "step": 2748, "train/total_loss": 0.1768956184387207 }, { "entropy": 8.780250549316406, "epoch": 0.27180146331817284, "mean_token_accuracy": 0.7776679992675781, "num_tokens": 15112451.0, "step": 2749, "train/ce_loss": 0.9847418069839478 }, { "epoch": 0.27180146331817284, "step": 2749, "train/sim_loss": 0.11328125 }, { "epoch": 0.27180146331817284, "step": 2749, "train/total_loss": 0.2117554247379303 }, { "entropy": 8.808507919311523, "epoch": 0.27190033616768833, "mean_token_accuracy": 0.7421568632125854, "num_tokens": 15118061.0, "step": 2750, "train/ce_loss": 1.0712966918945312 }, { "epoch": 0.27190033616768833, "step": 2750, "train/sim_loss": 0.046875 }, { "epoch": 0.27190033616768833, "step": 2750, "train/total_loss": 0.15400466322898865 }, { "entropy": 8.676015853881836, "epoch": 0.27199920901720387, "mean_token_accuracy": 0.7683896422386169, "num_tokens": 15123739.0, "step": 2751, "train/ce_loss": 0.702955424785614 }, { "epoch": 0.27199920901720387, "step": 2751, "train/sim_loss": 0.078125 }, { "epoch": 0.27199920901720387, "step": 2751, "train/total_loss": 0.1484205424785614 }, { "entropy": 8.918163299560547, "epoch": 0.2720980818667194, "mean_token_accuracy": 0.7480663061141968, "num_tokens": 15129266.0, "step": 2752, "train/ce_loss": 0.8123002648353577 }, { "epoch": 0.2720980818667194, "step": 2752, "train/sim_loss": 0.0625 }, { "epoch": 0.2720980818667194, "step": 2752, "train/total_loss": 0.143730029463768 }, { "entropy": 9.06640338897705, "epoch": 0.2721969547162349, "mean_token_accuracy": 0.8041112422943115, "num_tokens": 15134681.0, "step": 2753, "train/ce_loss": 0.5322244763374329 }, { "epoch": 0.2721969547162349, "step": 2753, "train/sim_loss": 0.0234375 }, { "epoch": 0.2721969547162349, "step": 2753, "train/total_loss": 0.07665994763374329 }, { "entropy": 8.878562927246094, "epoch": 0.27229582756575044, "mean_token_accuracy": 0.7239819169044495, "num_tokens": 15140222.0, "step": 2754, "train/ce_loss": 0.4012081027030945 }, { "epoch": 0.27229582756575044, "step": 2754, "train/sim_loss": 0.078125 }, { "epoch": 0.27229582756575044, "step": 2754, "train/total_loss": 0.11824581027030945 }, { "entropy": 8.273828506469727, "epoch": 0.272394700415266, "mean_token_accuracy": 0.7302231192588806, "num_tokens": 15146207.0, "step": 2755, "train/ce_loss": 1.5604405403137207 }, { "epoch": 0.272394700415266, "step": 2755, "train/sim_loss": 0.0703125 }, { "epoch": 0.272394700415266, "step": 2755, "train/total_loss": 0.22635655105113983 }, { "entropy": 8.602712631225586, "epoch": 0.27249357326478146, "mean_token_accuracy": 0.7453142404556274, "num_tokens": 15151800.0, "step": 2756, "train/ce_loss": 0.6832882165908813 }, { "epoch": 0.27249357326478146, "step": 2756, "train/sim_loss": 0.0703125 }, { "epoch": 0.27249357326478146, "step": 2756, "train/total_loss": 0.1386413276195526 }, { "entropy": 9.189170837402344, "epoch": 0.272592446114297, "mean_token_accuracy": 0.7813712954521179, "num_tokens": 15157209.0, "step": 2757, "train/ce_loss": 0.921110987663269 }, { "epoch": 0.272592446114297, "step": 2757, "train/sim_loss": 0.11328125 }, { "epoch": 0.272592446114297, "step": 2757, "train/total_loss": 0.20539236068725586 }, { "entropy": 8.635696411132812, "epoch": 0.27269131896381255, "mean_token_accuracy": 0.7512690424919128, "num_tokens": 15162791.0, "step": 2758, "train/ce_loss": 1.0701462030410767 }, { "epoch": 0.27269131896381255, "step": 2758, "train/sim_loss": 0.04296875 }, { "epoch": 0.27269131896381255, "step": 2758, "train/total_loss": 0.14998337626457214 }, { "entropy": 9.104533195495605, "epoch": 0.2727901918133281, "mean_token_accuracy": 0.7634408473968506, "num_tokens": 15168173.0, "step": 2759, "train/ce_loss": 0.7194211483001709 }, { "epoch": 0.2727901918133281, "step": 2759, "train/sim_loss": 0.08203125 }, { "epoch": 0.2727901918133281, "step": 2759, "train/total_loss": 0.15397337079048157 }, { "epoch": 0.2728890646628436, "grad_norm": 0.7687532901763916, "learning_rate": 9.320328339019929e-06, "loss": 0.142, "step": 2760 }, { "entropy": 8.946598052978516, "epoch": 0.2728890646628436, "mean_token_accuracy": 0.7069717049598694, "num_tokens": 15173684.0, "step": 2760, "train/ce_loss": 1.2463408708572388 }, { "epoch": 0.2728890646628436, "step": 2760, "train/sim_loss": 0.08203125 }, { "epoch": 0.2728890646628436, "step": 2760, "train/total_loss": 0.20666533708572388 }, { "entropy": 9.08772087097168, "epoch": 0.2729879375123591, "mean_token_accuracy": 0.6935704350471497, "num_tokens": 15179101.0, "step": 2761, "train/ce_loss": 1.3301407098770142 }, { "epoch": 0.2729879375123591, "step": 2761, "train/sim_loss": 0.05859375 }, { "epoch": 0.2729879375123591, "step": 2761, "train/total_loss": 0.19160781800746918 }, { "entropy": 9.238638877868652, "epoch": 0.27308681036187465, "mean_token_accuracy": 0.7409162521362305, "num_tokens": 15184346.0, "step": 2762, "train/ce_loss": 0.7890664339065552 }, { "epoch": 0.27308681036187465, "step": 2762, "train/sim_loss": 0.05078125 }, { "epoch": 0.27308681036187465, "step": 2762, "train/total_loss": 0.12968790531158447 }, { "entropy": 9.109575271606445, "epoch": 0.27318568321139014, "mean_token_accuracy": 0.7064102292060852, "num_tokens": 15189791.0, "step": 2763, "train/ce_loss": 0.7888553142547607 }, { "epoch": 0.27318568321139014, "step": 2763, "train/sim_loss": 0.0703125 }, { "epoch": 0.27318568321139014, "step": 2763, "train/total_loss": 0.1491980254650116 }, { "entropy": 8.68143081665039, "epoch": 0.2732845560609057, "mean_token_accuracy": 0.7219192981719971, "num_tokens": 15195265.0, "step": 2764, "train/ce_loss": 0.5985942482948303 }, { "epoch": 0.2732845560609057, "step": 2764, "train/sim_loss": 0.0390625 }, { "epoch": 0.2732845560609057, "step": 2764, "train/total_loss": 0.09892192482948303 }, { "entropy": 8.827057838439941, "epoch": 0.2733834289104212, "mean_token_accuracy": 0.7502774596214294, "num_tokens": 15200772.0, "step": 2765, "train/ce_loss": 0.9444445967674255 }, { "epoch": 0.2733834289104212, "step": 2765, "train/sim_loss": 0.0625 }, { "epoch": 0.2733834289104212, "step": 2765, "train/total_loss": 0.15694445371627808 }, { "entropy": 8.82568359375, "epoch": 0.2734823017599367, "mean_token_accuracy": 0.7095141410827637, "num_tokens": 15206384.0, "step": 2766, "train/ce_loss": 0.5892278552055359 }, { "epoch": 0.2734823017599367, "step": 2766, "train/sim_loss": 0.046875 }, { "epoch": 0.2734823017599367, "step": 2766, "train/total_loss": 0.10579778254032135 }, { "entropy": 8.938541412353516, "epoch": 0.27358117460945225, "mean_token_accuracy": 0.7684210538864136, "num_tokens": 15211913.0, "step": 2767, "train/ce_loss": 0.7373906373977661 }, { "epoch": 0.27358117460945225, "step": 2767, "train/sim_loss": 0.09375 }, { "epoch": 0.27358117460945225, "step": 2767, "train/total_loss": 0.16748906672000885 }, { "entropy": 9.14603042602539, "epoch": 0.2736800474589678, "mean_token_accuracy": 0.7666666507720947, "num_tokens": 15217363.0, "step": 2768, "train/ce_loss": 0.7322885990142822 }, { "epoch": 0.2736800474589678, "step": 2768, "train/sim_loss": 0.078125 }, { "epoch": 0.2736800474589678, "step": 2768, "train/total_loss": 0.1513538658618927 }, { "entropy": 9.379240036010742, "epoch": 0.2737789203084833, "mean_token_accuracy": 0.7648809552192688, "num_tokens": 15222603.0, "step": 2769, "train/ce_loss": 0.8060787916183472 }, { "epoch": 0.2737789203084833, "step": 2769, "train/sim_loss": 0.046875 }, { "epoch": 0.2737789203084833, "step": 2769, "train/total_loss": 0.12748289108276367 }, { "entropy": 9.040748596191406, "epoch": 0.2738777931579988, "mean_token_accuracy": 0.7207415699958801, "num_tokens": 15228085.0, "step": 2770, "train/ce_loss": 1.366797685623169 }, { "epoch": 0.2738777931579988, "step": 2770, "train/sim_loss": 0.1015625 }, { "epoch": 0.2738777931579988, "step": 2770, "train/total_loss": 0.2382422685623169 }, { "entropy": 9.12544059753418, "epoch": 0.27397666600751436, "mean_token_accuracy": 0.7829839587211609, "num_tokens": 15233494.0, "step": 2771, "train/ce_loss": 0.5567338466644287 }, { "epoch": 0.27397666600751436, "step": 2771, "train/sim_loss": 0.0625 }, { "epoch": 0.27397666600751436, "step": 2771, "train/total_loss": 0.11817339062690735 }, { "entropy": 8.99071979522705, "epoch": 0.27407553885702984, "mean_token_accuracy": 0.771700382232666, "num_tokens": 15238965.0, "step": 2772, "train/ce_loss": 0.3417659103870392 }, { "epoch": 0.27407553885702984, "step": 2772, "train/sim_loss": 0.02734375 }, { "epoch": 0.27407553885702984, "step": 2772, "train/total_loss": 0.06152034178376198 }, { "entropy": 8.951696395874023, "epoch": 0.2741744117065454, "mean_token_accuracy": 0.73221755027771, "num_tokens": 15244494.0, "step": 2773, "train/ce_loss": 0.9184609651565552 }, { "epoch": 0.2741744117065454, "step": 2773, "train/sim_loss": 0.07421875 }, { "epoch": 0.2741744117065454, "step": 2773, "train/total_loss": 0.16606485843658447 }, { "entropy": 8.927734375, "epoch": 0.2742732845560609, "mean_token_accuracy": 0.7239766120910645, "num_tokens": 15249988.0, "step": 2774, "train/ce_loss": 1.075432538986206 }, { "epoch": 0.2742732845560609, "step": 2774, "train/sim_loss": 0.109375 }, { "epoch": 0.2742732845560609, "step": 2774, "train/total_loss": 0.21691825985908508 }, { "entropy": 9.192229270935059, "epoch": 0.2743721574055764, "mean_token_accuracy": 0.7268232107162476, "num_tokens": 15255362.0, "step": 2775, "train/ce_loss": 1.0655601024627686 }, { "epoch": 0.2743721574055764, "step": 2775, "train/sim_loss": 0.05859375 }, { "epoch": 0.2743721574055764, "step": 2775, "train/total_loss": 0.1651497632265091 }, { "entropy": 8.896421432495117, "epoch": 0.27447103025509195, "mean_token_accuracy": 0.7120084762573242, "num_tokens": 15260895.0, "step": 2776, "train/ce_loss": 0.8982424736022949 }, { "epoch": 0.27447103025509195, "step": 2776, "train/sim_loss": 0.109375 }, { "epoch": 0.27447103025509195, "step": 2776, "train/total_loss": 0.19919925928115845 }, { "entropy": 9.300304412841797, "epoch": 0.2745699031046075, "mean_token_accuracy": 0.7591888308525085, "num_tokens": 15266157.0, "step": 2777, "train/ce_loss": 0.6372509598731995 }, { "epoch": 0.2745699031046075, "step": 2777, "train/sim_loss": 0.05078125 }, { "epoch": 0.2745699031046075, "step": 2777, "train/total_loss": 0.11450634896755219 }, { "entropy": 8.740734100341797, "epoch": 0.274668775954123, "mean_token_accuracy": 0.7331042289733887, "num_tokens": 15271654.0, "step": 2778, "train/ce_loss": 0.8208907842636108 }, { "epoch": 0.274668775954123, "step": 2778, "train/sim_loss": 0.078125 }, { "epoch": 0.274668775954123, "step": 2778, "train/total_loss": 0.16021408140659332 }, { "entropy": 8.879854202270508, "epoch": 0.2747676488036385, "mean_token_accuracy": 0.8068181872367859, "num_tokens": 15277187.0, "step": 2779, "train/ce_loss": 1.0514099597930908 }, { "epoch": 0.2747676488036385, "step": 2779, "train/sim_loss": 0.046875 }, { "epoch": 0.2747676488036385, "step": 2779, "train/total_loss": 0.15201599895954132 }, { "epoch": 0.27486652165315406, "grad_norm": 0.740213930606842, "learning_rate": 9.31538347426198e-06, "loss": 0.1453, "step": 2780 }, { "entropy": 9.093647003173828, "epoch": 0.27486652165315406, "mean_token_accuracy": 0.7134570479393005, "num_tokens": 15282689.0, "step": 2780, "train/ce_loss": 0.7887966632843018 }, { "epoch": 0.27486652165315406, "step": 2780, "train/sim_loss": 0.0703125 }, { "epoch": 0.27486652165315406, "step": 2780, "train/total_loss": 0.14919216930866241 }, { "entropy": 8.994417190551758, "epoch": 0.27496539450266955, "mean_token_accuracy": 0.7666666507720947, "num_tokens": 15288170.0, "step": 2781, "train/ce_loss": 0.8710976839065552 }, { "epoch": 0.27496539450266955, "step": 2781, "train/sim_loss": 0.0859375 }, { "epoch": 0.27496539450266955, "step": 2781, "train/total_loss": 0.17304727435112 }, { "entropy": 8.726607322692871, "epoch": 0.2750642673521851, "mean_token_accuracy": 0.7287553548812866, "num_tokens": 15293907.0, "step": 2782, "train/ce_loss": 1.8811317682266235 }, { "epoch": 0.2750642673521851, "step": 2782, "train/sim_loss": 0.078125 }, { "epoch": 0.2750642673521851, "step": 2782, "train/total_loss": 0.26623818278312683 }, { "entropy": 8.928232192993164, "epoch": 0.27516314020170063, "mean_token_accuracy": 0.7206595540046692, "num_tokens": 15299602.0, "step": 2783, "train/ce_loss": 0.849024772644043 }, { "epoch": 0.27516314020170063, "step": 2783, "train/sim_loss": 0.09375 }, { "epoch": 0.27516314020170063, "step": 2783, "train/total_loss": 0.17865248024463654 }, { "entropy": 8.93177318572998, "epoch": 0.2752620130512161, "mean_token_accuracy": 0.7429805397987366, "num_tokens": 15305195.0, "step": 2784, "train/ce_loss": 0.6827086210250854 }, { "epoch": 0.2752620130512161, "step": 2784, "train/sim_loss": 0.01953125 }, { "epoch": 0.2752620130512161, "step": 2784, "train/total_loss": 0.08780211210250854 }, { "entropy": 8.831356048583984, "epoch": 0.27536088590073166, "mean_token_accuracy": 0.7350254058837891, "num_tokens": 15310862.0, "step": 2785, "train/ce_loss": 1.2548748254776 }, { "epoch": 0.27536088590073166, "step": 2785, "train/sim_loss": 0.0703125 }, { "epoch": 0.27536088590073166, "step": 2785, "train/total_loss": 0.19579999148845673 }, { "entropy": 9.136846542358398, "epoch": 0.2754597587502472, "mean_token_accuracy": 0.719072163105011, "num_tokens": 15316192.0, "step": 2786, "train/ce_loss": 1.1198638677597046 }, { "epoch": 0.2754597587502472, "step": 2786, "train/sim_loss": 0.07421875 }, { "epoch": 0.2754597587502472, "step": 2786, "train/total_loss": 0.18620514869689941 }, { "entropy": 9.32155990600586, "epoch": 0.2755586315997627, "mean_token_accuracy": 0.7092105150222778, "num_tokens": 15321526.0, "step": 2787, "train/ce_loss": 0.9706707000732422 }, { "epoch": 0.2755586315997627, "step": 2787, "train/sim_loss": 0.046875 }, { "epoch": 0.2755586315997627, "step": 2787, "train/total_loss": 0.14394207298755646 }, { "entropy": 9.340791702270508, "epoch": 0.2756575044492782, "mean_token_accuracy": 0.6971830725669861, "num_tokens": 15326830.0, "step": 2788, "train/ce_loss": 1.3964905738830566 }, { "epoch": 0.2756575044492782, "step": 2788, "train/sim_loss": 0.078125 }, { "epoch": 0.2756575044492782, "step": 2788, "train/total_loss": 0.21777406334877014 }, { "entropy": 8.970388412475586, "epoch": 0.27575637729879376, "mean_token_accuracy": 0.7372593283653259, "num_tokens": 15332270.0, "step": 2789, "train/ce_loss": 0.8262267708778381 }, { "epoch": 0.27575637729879376, "step": 2789, "train/sim_loss": 0.109375 }, { "epoch": 0.27575637729879376, "step": 2789, "train/total_loss": 0.1919976770877838 }, { "entropy": 9.196338653564453, "epoch": 0.27585525014830925, "mean_token_accuracy": 0.7520891427993774, "num_tokens": 15337500.0, "step": 2790, "train/ce_loss": 1.0021562576293945 }, { "epoch": 0.27585525014830925, "step": 2790, "train/sim_loss": 0.078125 }, { "epoch": 0.27585525014830925, "step": 2790, "train/total_loss": 0.1783406287431717 }, { "entropy": 9.225892066955566, "epoch": 0.2759541229978248, "mean_token_accuracy": 0.7766871452331543, "num_tokens": 15342957.0, "step": 2791, "train/ce_loss": 0.7255532741546631 }, { "epoch": 0.2759541229978248, "step": 2791, "train/sim_loss": 0.0625 }, { "epoch": 0.2759541229978248, "step": 2791, "train/total_loss": 0.1350553333759308 }, { "entropy": 8.999250411987305, "epoch": 0.27605299584734033, "mean_token_accuracy": 0.7154663801193237, "num_tokens": 15348424.0, "step": 2792, "train/ce_loss": 0.5931878089904785 }, { "epoch": 0.27605299584734033, "step": 2792, "train/sim_loss": 0.08203125 }, { "epoch": 0.27605299584734033, "step": 2792, "train/total_loss": 0.14135003089904785 }, { "entropy": 8.943933486938477, "epoch": 0.2761518686968558, "mean_token_accuracy": 0.8127018213272095, "num_tokens": 15353939.0, "step": 2793, "train/ce_loss": 0.635563850402832 }, { "epoch": 0.2761518686968558, "step": 2793, "train/sim_loss": 0.02734375 }, { "epoch": 0.2761518686968558, "step": 2793, "train/total_loss": 0.09090013802051544 }, { "entropy": 9.446842193603516, "epoch": 0.27625074154637136, "mean_token_accuracy": 0.7764900922775269, "num_tokens": 15359154.0, "step": 2794, "train/ce_loss": 0.5520123839378357 }, { "epoch": 0.27625074154637136, "step": 2794, "train/sim_loss": 0.0625 }, { "epoch": 0.27625074154637136, "step": 2794, "train/total_loss": 0.11770123988389969 }, { "entropy": 9.13991928100586, "epoch": 0.2763496143958869, "mean_token_accuracy": 0.7344497442245483, "num_tokens": 15364586.0, "step": 2795, "train/ce_loss": 0.40625664591789246 }, { "epoch": 0.2763496143958869, "step": 2795, "train/sim_loss": 0.0703125 }, { "epoch": 0.2763496143958869, "step": 2795, "train/total_loss": 0.110938161611557 }, { "entropy": 9.334680557250977, "epoch": 0.2764484872454024, "mean_token_accuracy": 0.7387057542800903, "num_tokens": 15370004.0, "step": 2796, "train/ce_loss": 0.715522825717926 }, { "epoch": 0.2764484872454024, "step": 2796, "train/sim_loss": 0.09375 }, { "epoch": 0.2764484872454024, "step": 2796, "train/total_loss": 0.16530227661132812 }, { "entropy": 8.855464935302734, "epoch": 0.2765473600949179, "mean_token_accuracy": 0.7235682606697083, "num_tokens": 15375569.0, "step": 2797, "train/ce_loss": 0.4417724311351776 }, { "epoch": 0.2765473600949179, "step": 2797, "train/sim_loss": 0.078125 }, { "epoch": 0.2765473600949179, "step": 2797, "train/total_loss": 0.12230224907398224 }, { "entropy": 9.289267539978027, "epoch": 0.27664623294443347, "mean_token_accuracy": 0.73051518201828, "num_tokens": 15380945.0, "step": 2798, "train/ce_loss": 0.843120276927948 }, { "epoch": 0.27664623294443347, "step": 2798, "train/sim_loss": 0.0234375 }, { "epoch": 0.27664623294443347, "step": 2798, "train/total_loss": 0.10774952918291092 }, { "entropy": 9.03884220123291, "epoch": 0.276745105793949, "mean_token_accuracy": 0.7204724550247192, "num_tokens": 15386302.0, "step": 2799, "train/ce_loss": 0.9210829734802246 }, { "epoch": 0.276745105793949, "step": 2799, "train/sim_loss": 0.07421875 }, { "epoch": 0.276745105793949, "step": 2799, "train/total_loss": 0.16632705926895142 }, { "epoch": 0.2768439786434645, "grad_norm": 0.8680959343910217, "learning_rate": 9.310438609504032e-06, "loss": 0.1503, "step": 2800 }, { "entropy": 8.895750045776367, "epoch": 0.2768439786434645, "mean_token_accuracy": 0.7200000286102295, "num_tokens": 15391863.0, "step": 2800, "train/ce_loss": 0.30964335799217224 }, { "epoch": 0.2768439786434645, "step": 2800, "train/sim_loss": 0.0546875 }, { "epoch": 0.2768439786434645, "step": 2800, "train/total_loss": 0.08565183728933334 }, { "entropy": 9.095988273620605, "epoch": 0.27694285149298004, "mean_token_accuracy": 0.7514654397964478, "num_tokens": 15397261.0, "step": 2801, "train/ce_loss": 0.37885233759880066 }, { "epoch": 0.27694285149298004, "step": 2801, "train/sim_loss": 0.01953125 }, { "epoch": 0.27694285149298004, "step": 2801, "train/total_loss": 0.057416483759880066 }, { "entropy": 9.120649337768555, "epoch": 0.2770417243424956, "mean_token_accuracy": 0.7854630947113037, "num_tokens": 15402731.0, "step": 2802, "train/ce_loss": 0.7187972068786621 }, { "epoch": 0.2770417243424956, "step": 2802, "train/sim_loss": 0.078125 }, { "epoch": 0.2770417243424956, "step": 2802, "train/total_loss": 0.15000471472740173 }, { "entropy": 9.258299827575684, "epoch": 0.27714059719201106, "mean_token_accuracy": 0.7568238377571106, "num_tokens": 15408039.0, "step": 2803, "train/ce_loss": 0.7097336053848267 }, { "epoch": 0.27714059719201106, "step": 2803, "train/sim_loss": 0.03125 }, { "epoch": 0.27714059719201106, "step": 2803, "train/total_loss": 0.10222335904836655 }, { "entropy": 9.077465057373047, "epoch": 0.2772394700415266, "mean_token_accuracy": 0.7570093274116516, "num_tokens": 15413414.0, "step": 2804, "train/ce_loss": 0.9463093280792236 }, { "epoch": 0.2772394700415266, "step": 2804, "train/sim_loss": 0.05078125 }, { "epoch": 0.2772394700415266, "step": 2804, "train/total_loss": 0.14541217684745789 }, { "entropy": 9.145472526550293, "epoch": 0.27733834289104214, "mean_token_accuracy": 0.7394636273384094, "num_tokens": 15418754.0, "step": 2805, "train/ce_loss": 0.9205010533332825 }, { "epoch": 0.27733834289104214, "step": 2805, "train/sim_loss": 0.06640625 }, { "epoch": 0.27733834289104214, "step": 2805, "train/total_loss": 0.15845635533332825 }, { "entropy": 9.06934928894043, "epoch": 0.27743721574055763, "mean_token_accuracy": 0.7458646893501282, "num_tokens": 15424123.0, "step": 2806, "train/ce_loss": 0.6518387794494629 }, { "epoch": 0.27743721574055763, "step": 2806, "train/sim_loss": 0.078125 }, { "epoch": 0.27743721574055763, "step": 2806, "train/total_loss": 0.1433088779449463 }, { "entropy": 8.841963768005371, "epoch": 0.27753608859007317, "mean_token_accuracy": 0.7508379817008972, "num_tokens": 15429638.0, "step": 2807, "train/ce_loss": 0.8197516798973083 }, { "epoch": 0.27753608859007317, "step": 2807, "train/sim_loss": 0.05078125 }, { "epoch": 0.27753608859007317, "step": 2807, "train/total_loss": 0.13275641202926636 }, { "entropy": 9.216567993164062, "epoch": 0.2776349614395887, "mean_token_accuracy": 0.7660208940505981, "num_tokens": 15434951.0, "step": 2808, "train/ce_loss": 0.7483777403831482 }, { "epoch": 0.2776349614395887, "step": 2808, "train/sim_loss": 0.0703125 }, { "epoch": 0.2776349614395887, "step": 2808, "train/total_loss": 0.14515027403831482 }, { "entropy": 8.905106544494629, "epoch": 0.2777338342891042, "mean_token_accuracy": 0.8087934851646423, "num_tokens": 15440546.0, "step": 2809, "train/ce_loss": 0.590493381023407 }, { "epoch": 0.2777338342891042, "step": 2809, "train/sim_loss": 0.0234375 }, { "epoch": 0.2777338342891042, "step": 2809, "train/total_loss": 0.0824868381023407 }, { "entropy": 9.007984161376953, "epoch": 0.27783270713861974, "mean_token_accuracy": 0.751937985420227, "num_tokens": 15446102.0, "step": 2810, "train/ce_loss": 1.1346385478973389 }, { "epoch": 0.27783270713861974, "step": 2810, "train/sim_loss": 0.08203125 }, { "epoch": 0.27783270713861974, "step": 2810, "train/total_loss": 0.1954950988292694 }, { "entropy": 9.033472061157227, "epoch": 0.2779315799881353, "mean_token_accuracy": 0.7204301357269287, "num_tokens": 15451535.0, "step": 2811, "train/ce_loss": 0.771938681602478 }, { "epoch": 0.2779315799881353, "step": 2811, "train/sim_loss": 0.046875 }, { "epoch": 0.2779315799881353, "step": 2811, "train/total_loss": 0.12406887114048004 }, { "entropy": 8.696282386779785, "epoch": 0.27803045283765077, "mean_token_accuracy": 0.7621951103210449, "num_tokens": 15457082.0, "step": 2812, "train/ce_loss": 0.6210614442825317 }, { "epoch": 0.27803045283765077, "step": 2812, "train/sim_loss": 0.08984375 }, { "epoch": 0.27803045283765077, "step": 2812, "train/total_loss": 0.1519498974084854 }, { "entropy": 9.153282165527344, "epoch": 0.2781293256871663, "mean_token_accuracy": 0.8284424543380737, "num_tokens": 15462606.0, "step": 2813, "train/ce_loss": 0.42017117142677307 }, { "epoch": 0.2781293256871663, "step": 2813, "train/sim_loss": 0.109375 }, { "epoch": 0.2781293256871663, "step": 2813, "train/total_loss": 0.1513921171426773 }, { "entropy": 8.909027099609375, "epoch": 0.27822819853668185, "mean_token_accuracy": 0.734455943107605, "num_tokens": 15468015.0, "step": 2814, "train/ce_loss": 0.5906887054443359 }, { "epoch": 0.27822819853668185, "step": 2814, "train/sim_loss": 0.05859375 }, { "epoch": 0.27822819853668185, "step": 2814, "train/total_loss": 0.11766262352466583 }, { "entropy": 8.939517974853516, "epoch": 0.27832707138619733, "mean_token_accuracy": 0.7304075360298157, "num_tokens": 15473579.0, "step": 2815, "train/ce_loss": 0.5898017883300781 }, { "epoch": 0.27832707138619733, "step": 2815, "train/sim_loss": 0.0859375 }, { "epoch": 0.27832707138619733, "step": 2815, "train/total_loss": 0.14491768181324005 }, { "entropy": 8.95665454864502, "epoch": 0.2784259442357129, "mean_token_accuracy": 0.7379553318023682, "num_tokens": 15479054.0, "step": 2816, "train/ce_loss": 0.7665243744850159 }, { "epoch": 0.2784259442357129, "step": 2816, "train/sim_loss": 0.078125 }, { "epoch": 0.2784259442357129, "step": 2816, "train/total_loss": 0.1547774374485016 }, { "entropy": 9.224123001098633, "epoch": 0.2785248170852284, "mean_token_accuracy": 0.7861271500587463, "num_tokens": 15484381.0, "step": 2817, "train/ce_loss": 0.42708295583724976 }, { "epoch": 0.2785248170852284, "step": 2817, "train/sim_loss": 0.05859375 }, { "epoch": 0.2785248170852284, "step": 2817, "train/total_loss": 0.10130204260349274 }, { "entropy": 9.095832824707031, "epoch": 0.2786236899347439, "mean_token_accuracy": 0.6712172627449036, "num_tokens": 15489868.0, "step": 2818, "train/ce_loss": 0.9770092368125916 }, { "epoch": 0.2786236899347439, "step": 2818, "train/sim_loss": 0.03125 }, { "epoch": 0.2786236899347439, "step": 2818, "train/total_loss": 0.12895092368125916 }, { "entropy": 9.191495895385742, "epoch": 0.27872256278425944, "mean_token_accuracy": 0.6993464231491089, "num_tokens": 15495193.0, "step": 2819, "train/ce_loss": 0.6071676015853882 }, { "epoch": 0.27872256278425944, "step": 2819, "train/sim_loss": 0.0546875 }, { "epoch": 0.27872256278425944, "step": 2819, "train/total_loss": 0.11540426313877106 }, { "epoch": 0.278821435633775, "grad_norm": 0.8557233810424805, "learning_rate": 9.305493744746082e-06, "loss": 0.1431, "step": 2820 }, { "entropy": 8.83104133605957, "epoch": 0.278821435633775, "mean_token_accuracy": 0.772357702255249, "num_tokens": 15500677.0, "step": 2820, "train/ce_loss": 1.2988338470458984 }, { "epoch": 0.278821435633775, "step": 2820, "train/sim_loss": 0.08203125 }, { "epoch": 0.278821435633775, "step": 2820, "train/total_loss": 0.21191464364528656 }, { "entropy": 9.269227981567383, "epoch": 0.27892030848329047, "mean_token_accuracy": 0.7473261952400208, "num_tokens": 15506019.0, "step": 2821, "train/ce_loss": 0.7638499140739441 }, { "epoch": 0.27892030848329047, "step": 2821, "train/sim_loss": 0.07421875 }, { "epoch": 0.27892030848329047, "step": 2821, "train/total_loss": 0.1506037414073944 }, { "entropy": 8.865447998046875, "epoch": 0.279019181332806, "mean_token_accuracy": 0.7631579041481018, "num_tokens": 15511706.0, "step": 2822, "train/ce_loss": 1.5393993854522705 }, { "epoch": 0.279019181332806, "step": 2822, "train/sim_loss": 0.1328125 }, { "epoch": 0.279019181332806, "step": 2822, "train/total_loss": 0.28675246238708496 }, { "entropy": 9.021322250366211, "epoch": 0.27911805418232155, "mean_token_accuracy": 0.7606837749481201, "num_tokens": 15517114.0, "step": 2823, "train/ce_loss": 0.8194016814231873 }, { "epoch": 0.27911805418232155, "step": 2823, "train/sim_loss": 0.0859375 }, { "epoch": 0.27911805418232155, "step": 2823, "train/total_loss": 0.1678776741027832 }, { "entropy": 8.855917930603027, "epoch": 0.27921692703183704, "mean_token_accuracy": 0.7029598355293274, "num_tokens": 15522681.0, "step": 2824, "train/ce_loss": 0.748229444026947 }, { "epoch": 0.27921692703183704, "step": 2824, "train/sim_loss": 0.06640625 }, { "epoch": 0.27921692703183704, "step": 2824, "train/total_loss": 0.14122919738292694 }, { "entropy": 9.060811996459961, "epoch": 0.2793157998813526, "mean_token_accuracy": 0.7451456189155579, "num_tokens": 15528131.0, "step": 2825, "train/ce_loss": 0.939194917678833 }, { "epoch": 0.2793157998813526, "step": 2825, "train/sim_loss": 0.0234375 }, { "epoch": 0.2793157998813526, "step": 2825, "train/total_loss": 0.11735699325799942 }, { "entropy": 8.25743293762207, "epoch": 0.2794146727308681, "mean_token_accuracy": 0.6644784808158875, "num_tokens": 15534085.0, "step": 2826, "train/ce_loss": 1.0090254545211792 }, { "epoch": 0.2794146727308681, "step": 2826, "train/sim_loss": 0.05859375 }, { "epoch": 0.2794146727308681, "step": 2826, "train/total_loss": 0.15949630737304688 }, { "entropy": 8.714340209960938, "epoch": 0.2795135455803836, "mean_token_accuracy": 0.6910946369171143, "num_tokens": 15539751.0, "step": 2827, "train/ce_loss": 0.8696286678314209 }, { "epoch": 0.2795135455803836, "step": 2827, "train/sim_loss": 0.0546875 }, { "epoch": 0.2795135455803836, "step": 2827, "train/total_loss": 0.14165037870407104 }, { "entropy": 9.105966567993164, "epoch": 0.27961241842989915, "mean_token_accuracy": 0.7397435903549194, "num_tokens": 15545173.0, "step": 2828, "train/ce_loss": 0.7933923006057739 }, { "epoch": 0.27961241842989915, "step": 2828, "train/sim_loss": 0.078125 }, { "epoch": 0.27961241842989915, "step": 2828, "train/total_loss": 0.15746423602104187 }, { "entropy": 9.085497856140137, "epoch": 0.2797112912794147, "mean_token_accuracy": 0.76752769947052, "num_tokens": 15550811.0, "step": 2829, "train/ce_loss": 0.663153886795044 }, { "epoch": 0.2797112912794147, "step": 2829, "train/sim_loss": 0.1328125 }, { "epoch": 0.2797112912794147, "step": 2829, "train/total_loss": 0.19912788271903992 }, { "entropy": 9.086774826049805, "epoch": 0.27981016412893017, "mean_token_accuracy": 0.7402439117431641, "num_tokens": 15556215.0, "step": 2830, "train/ce_loss": 0.9601312875747681 }, { "epoch": 0.27981016412893017, "step": 2830, "train/sim_loss": 0.01953125 }, { "epoch": 0.27981016412893017, "step": 2830, "train/total_loss": 0.1155443787574768 }, { "entropy": 9.065472602844238, "epoch": 0.2799090369784457, "mean_token_accuracy": 0.7153931260108948, "num_tokens": 15561762.0, "step": 2831, "train/ce_loss": 0.6420674324035645 }, { "epoch": 0.2799090369784457, "step": 2831, "train/sim_loss": 0.03515625 }, { "epoch": 0.2799090369784457, "step": 2831, "train/total_loss": 0.09936299175024033 }, { "entropy": 9.237410545349121, "epoch": 0.28000790982796125, "mean_token_accuracy": 0.7313019633293152, "num_tokens": 15567130.0, "step": 2832, "train/ce_loss": 0.6669164299964905 }, { "epoch": 0.28000790982796125, "step": 2832, "train/sim_loss": 0.0625 }, { "epoch": 0.28000790982796125, "step": 2832, "train/total_loss": 0.12919163703918457 }, { "entropy": 8.941696166992188, "epoch": 0.28010678267747674, "mean_token_accuracy": 0.7160087823867798, "num_tokens": 15572636.0, "step": 2833, "train/ce_loss": 1.391119122505188 }, { "epoch": 0.28010678267747674, "step": 2833, "train/sim_loss": 0.109375 }, { "epoch": 0.28010678267747674, "step": 2833, "train/total_loss": 0.24848692119121552 }, { "entropy": 9.245611190795898, "epoch": 0.2802056555269923, "mean_token_accuracy": 0.7673202753067017, "num_tokens": 15578002.0, "step": 2834, "train/ce_loss": 0.9585338830947876 }, { "epoch": 0.2802056555269923, "step": 2834, "train/sim_loss": 0.0390625 }, { "epoch": 0.2802056555269923, "step": 2834, "train/total_loss": 0.13491588830947876 }, { "entropy": 8.982292175292969, "epoch": 0.2803045283765078, "mean_token_accuracy": 0.7367088794708252, "num_tokens": 15583400.0, "step": 2835, "train/ce_loss": 0.9219286441802979 }, { "epoch": 0.2803045283765078, "step": 2835, "train/sim_loss": 0.08203125 }, { "epoch": 0.2803045283765078, "step": 2835, "train/total_loss": 0.1742241084575653 }, { "entropy": 9.15916919708252, "epoch": 0.2804034012260233, "mean_token_accuracy": 0.7779220938682556, "num_tokens": 15588822.0, "step": 2836, "train/ce_loss": 0.35030072927474976 }, { "epoch": 0.2804034012260233, "step": 2836, "train/sim_loss": 0.0859375 }, { "epoch": 0.2804034012260233, "step": 2836, "train/total_loss": 0.1209675744175911 }, { "entropy": 8.861663818359375, "epoch": 0.28050227407553885, "mean_token_accuracy": 0.75157630443573, "num_tokens": 15594278.0, "step": 2837, "train/ce_loss": 0.9835109710693359 }, { "epoch": 0.28050227407553885, "step": 2837, "train/sim_loss": 0.0546875 }, { "epoch": 0.28050227407553885, "step": 2837, "train/total_loss": 0.15303859114646912 }, { "entropy": 8.971227645874023, "epoch": 0.2806011469250544, "mean_token_accuracy": 0.7548308968544006, "num_tokens": 15599722.0, "step": 2838, "train/ce_loss": 0.9045187830924988 }, { "epoch": 0.2806011469250544, "step": 2838, "train/sim_loss": 0.06640625 }, { "epoch": 0.2806011469250544, "step": 2838, "train/total_loss": 0.15685813128948212 }, { "entropy": 8.759530067443848, "epoch": 0.2807000197745699, "mean_token_accuracy": 0.6682555079460144, "num_tokens": 15605373.0, "step": 2839, "train/ce_loss": 0.7360123991966248 }, { "epoch": 0.2807000197745699, "step": 2839, "train/sim_loss": 0.1328125 }, { "epoch": 0.2807000197745699, "step": 2839, "train/total_loss": 0.20641374588012695 }, { "epoch": 0.2807988926240854, "grad_norm": 0.9163339138031006, "learning_rate": 9.300548879988133e-06, "loss": 0.1546, "step": 2840 }, { "entropy": 9.212146759033203, "epoch": 0.2807988926240854, "mean_token_accuracy": 0.7440224885940552, "num_tokens": 15610641.0, "step": 2840, "train/ce_loss": 0.7035618424415588 }, { "epoch": 0.2807988926240854, "step": 2840, "train/sim_loss": 0.03125 }, { "epoch": 0.2807988926240854, "step": 2840, "train/total_loss": 0.10160618275403976 }, { "entropy": 8.962800979614258, "epoch": 0.28089776547360096, "mean_token_accuracy": 0.7335723042488098, "num_tokens": 15616147.0, "step": 2841, "train/ce_loss": 0.8099513053894043 }, { "epoch": 0.28089776547360096, "step": 2841, "train/sim_loss": 0.08984375 }, { "epoch": 0.28089776547360096, "step": 2841, "train/total_loss": 0.17083889245986938 }, { "entropy": 9.291296005249023, "epoch": 0.2809966383231165, "mean_token_accuracy": 0.8040057420730591, "num_tokens": 15621487.0, "step": 2842, "train/ce_loss": 0.43099430203437805 }, { "epoch": 0.2809966383231165, "step": 2842, "train/sim_loss": 0.09765625 }, { "epoch": 0.2809966383231165, "step": 2842, "train/total_loss": 0.14075568318367004 }, { "entropy": 8.973970413208008, "epoch": 0.281095511172632, "mean_token_accuracy": 0.7796420454978943, "num_tokens": 15626956.0, "step": 2843, "train/ce_loss": 0.5995858311653137 }, { "epoch": 0.281095511172632, "step": 2843, "train/sim_loss": 0.05859375 }, { "epoch": 0.281095511172632, "step": 2843, "train/total_loss": 0.11855233460664749 }, { "entropy": 8.867843627929688, "epoch": 0.2811943840221475, "mean_token_accuracy": 0.6794208884239197, "num_tokens": 15632582.0, "step": 2844, "train/ce_loss": 0.6715723276138306 }, { "epoch": 0.2811943840221475, "step": 2844, "train/sim_loss": 0.0546875 }, { "epoch": 0.2811943840221475, "step": 2844, "train/total_loss": 0.12184473127126694 }, { "entropy": 8.882942199707031, "epoch": 0.28129325687166307, "mean_token_accuracy": 0.7848101258277893, "num_tokens": 15638113.0, "step": 2845, "train/ce_loss": 0.6385392546653748 }, { "epoch": 0.28129325687166307, "step": 2845, "train/sim_loss": 0.06640625 }, { "epoch": 0.28129325687166307, "step": 2845, "train/total_loss": 0.130260169506073 }, { "entropy": 8.96957015991211, "epoch": 0.28139212972117855, "mean_token_accuracy": 0.7341317534446716, "num_tokens": 15643555.0, "step": 2846, "train/ce_loss": 0.7602388262748718 }, { "epoch": 0.28139212972117855, "step": 2846, "train/sim_loss": 0.03515625 }, { "epoch": 0.28139212972117855, "step": 2846, "train/total_loss": 0.1111801341176033 }, { "entropy": 8.980449676513672, "epoch": 0.2814910025706941, "mean_token_accuracy": 0.745555579662323, "num_tokens": 15649065.0, "step": 2847, "train/ce_loss": 0.6544799208641052 }, { "epoch": 0.2814910025706941, "step": 2847, "train/sim_loss": 0.09375 }, { "epoch": 0.2814910025706941, "step": 2847, "train/total_loss": 0.15919798612594604 }, { "entropy": 9.26309585571289, "epoch": 0.28158987542020963, "mean_token_accuracy": 0.7524154782295227, "num_tokens": 15654473.0, "step": 2848, "train/ce_loss": 0.6890347003936768 }, { "epoch": 0.28158987542020963, "step": 2848, "train/sim_loss": 0.03125 }, { "epoch": 0.28158987542020963, "step": 2848, "train/total_loss": 0.10015346854925156 }, { "entropy": 8.441256523132324, "epoch": 0.2816887482697251, "mean_token_accuracy": 0.7280966639518738, "num_tokens": 15660064.0, "step": 2849, "train/ce_loss": 0.5744578242301941 }, { "epoch": 0.2816887482697251, "step": 2849, "train/sim_loss": 0.05859375 }, { "epoch": 0.2816887482697251, "step": 2849, "train/total_loss": 0.11603952944278717 }, { "entropy": 8.662765502929688, "epoch": 0.28178762111924066, "mean_token_accuracy": 0.7037814855575562, "num_tokens": 15665582.0, "step": 2850, "train/ce_loss": 1.2507582902908325 }, { "epoch": 0.28178762111924066, "step": 2850, "train/sim_loss": 0.078125 }, { "epoch": 0.28178762111924066, "step": 2850, "train/total_loss": 0.2032008320093155 }, { "entropy": 9.040231704711914, "epoch": 0.2818864939687562, "mean_token_accuracy": 0.7787810564041138, "num_tokens": 15671009.0, "step": 2851, "train/ce_loss": 0.7193268537521362 }, { "epoch": 0.2818864939687562, "step": 2851, "train/sim_loss": 0.02734375 }, { "epoch": 0.2818864939687562, "step": 2851, "train/total_loss": 0.09927643835544586 }, { "entropy": 8.776036262512207, "epoch": 0.2819853668182717, "mean_token_accuracy": 0.7395833134651184, "num_tokens": 15676621.0, "step": 2852, "train/ce_loss": 0.6679022312164307 }, { "epoch": 0.2819853668182717, "step": 2852, "train/sim_loss": 0.05078125 }, { "epoch": 0.2819853668182717, "step": 2852, "train/total_loss": 0.11757147312164307 }, { "entropy": 9.161849021911621, "epoch": 0.28208423966778723, "mean_token_accuracy": 0.752043604850769, "num_tokens": 15681919.0, "step": 2853, "train/ce_loss": 1.1990145444869995 }, { "epoch": 0.28208423966778723, "step": 2853, "train/sim_loss": 0.09375 }, { "epoch": 0.28208423966778723, "step": 2853, "train/total_loss": 0.21365144848823547 }, { "entropy": 9.199319839477539, "epoch": 0.28218311251730277, "mean_token_accuracy": 0.761255145072937, "num_tokens": 15687279.0, "step": 2854, "train/ce_loss": 0.85629802942276 }, { "epoch": 0.28218311251730277, "step": 2854, "train/sim_loss": 0.0546875 }, { "epoch": 0.28218311251730277, "step": 2854, "train/total_loss": 0.14031730592250824 }, { "entropy": 9.171686172485352, "epoch": 0.28228198536681826, "mean_token_accuracy": 0.7632508873939514, "num_tokens": 15692690.0, "step": 2855, "train/ce_loss": 0.49775218963623047 }, { "epoch": 0.28228198536681826, "step": 2855, "train/sim_loss": 0.078125 }, { "epoch": 0.28228198536681826, "step": 2855, "train/total_loss": 0.12790021300315857 }, { "entropy": 9.264739036560059, "epoch": 0.2823808582163338, "mean_token_accuracy": 0.7529566287994385, "num_tokens": 15698058.0, "step": 2856, "train/ce_loss": 0.4153069853782654 }, { "epoch": 0.2823808582163338, "step": 2856, "train/sim_loss": 0.0234375 }, { "epoch": 0.2823808582163338, "step": 2856, "train/total_loss": 0.06496819853782654 }, { "entropy": 8.694707870483398, "epoch": 0.28247973106584934, "mean_token_accuracy": 0.716269850730896, "num_tokens": 15703756.0, "step": 2857, "train/ce_loss": 0.6911802291870117 }, { "epoch": 0.28247973106584934, "step": 2857, "train/sim_loss": 0.0234375 }, { "epoch": 0.28247973106584934, "step": 2857, "train/total_loss": 0.09255552291870117 }, { "entropy": 9.319952011108398, "epoch": 0.2825786039153648, "mean_token_accuracy": 0.7017045617103577, "num_tokens": 15709033.0, "step": 2858, "train/ce_loss": 0.942584216594696 }, { "epoch": 0.2825786039153648, "step": 2858, "train/sim_loss": 0.1015625 }, { "epoch": 0.2825786039153648, "step": 2858, "train/total_loss": 0.19582092761993408 }, { "entropy": 8.813804626464844, "epoch": 0.28267747676488036, "mean_token_accuracy": 0.7402101159095764, "num_tokens": 15714698.0, "step": 2859, "train/ce_loss": 0.7038694620132446 }, { "epoch": 0.28267747676488036, "step": 2859, "train/sim_loss": 0.0390625 }, { "epoch": 0.28267747676488036, "step": 2859, "train/total_loss": 0.10944944620132446 }, { "epoch": 0.2827763496143959, "grad_norm": 0.808752715587616, "learning_rate": 9.295604015230185e-06, "loss": 0.1503, "step": 2860 }, { "entropy": 8.685885429382324, "epoch": 0.2827763496143959, "mean_token_accuracy": 0.768750011920929, "num_tokens": 15720113.0, "step": 2860, "train/ce_loss": 0.6707696318626404 }, { "epoch": 0.2827763496143959, "step": 2860, "train/sim_loss": 0.05859375 }, { "epoch": 0.2827763496143959, "step": 2860, "train/total_loss": 0.12567071616649628 }, { "entropy": 8.887486457824707, "epoch": 0.2828752224639114, "mean_token_accuracy": 0.7223404049873352, "num_tokens": 15725683.0, "step": 2861, "train/ce_loss": 0.5524101257324219 }, { "epoch": 0.2828752224639114, "step": 2861, "train/sim_loss": 0.06640625 }, { "epoch": 0.2828752224639114, "step": 2861, "train/total_loss": 0.12164726853370667 }, { "entropy": 9.228086471557617, "epoch": 0.28297409531342693, "mean_token_accuracy": 0.7573333382606506, "num_tokens": 15731048.0, "step": 2862, "train/ce_loss": 0.8286554217338562 }, { "epoch": 0.28297409531342693, "step": 2862, "train/sim_loss": 0.03125 }, { "epoch": 0.28297409531342693, "step": 2862, "train/total_loss": 0.11411554366350174 }, { "entropy": 9.014228820800781, "epoch": 0.2830729681629425, "mean_token_accuracy": 0.7710437774658203, "num_tokens": 15736579.0, "step": 2863, "train/ce_loss": 0.3857502341270447 }, { "epoch": 0.2830729681629425, "step": 2863, "train/sim_loss": 0.03125 }, { "epoch": 0.2830729681629425, "step": 2863, "train/total_loss": 0.06982502341270447 }, { "entropy": 9.071747779846191, "epoch": 0.28317184101245796, "mean_token_accuracy": 0.7093153595924377, "num_tokens": 15742100.0, "step": 2864, "train/ce_loss": 1.2877171039581299 }, { "epoch": 0.28317184101245796, "step": 2864, "train/sim_loss": 0.109375 }, { "epoch": 0.28317184101245796, "step": 2864, "train/total_loss": 0.23814670741558075 }, { "entropy": 8.961441040039062, "epoch": 0.2832707138619735, "mean_token_accuracy": 0.8214285969734192, "num_tokens": 15747617.0, "step": 2865, "train/ce_loss": 0.4224635064601898 }, { "epoch": 0.2832707138619735, "step": 2865, "train/sim_loss": 0.03125 }, { "epoch": 0.2832707138619735, "step": 2865, "train/total_loss": 0.07349635660648346 }, { "entropy": 9.172062873840332, "epoch": 0.28336958671148904, "mean_token_accuracy": 0.70540851354599, "num_tokens": 15753125.0, "step": 2866, "train/ce_loss": 0.7712848782539368 }, { "epoch": 0.28336958671148904, "step": 2866, "train/sim_loss": 0.08984375 }, { "epoch": 0.28336958671148904, "step": 2866, "train/total_loss": 0.16697224974632263 }, { "entropy": 8.916380882263184, "epoch": 0.2834684595610045, "mean_token_accuracy": 0.7697160840034485, "num_tokens": 15758700.0, "step": 2867, "train/ce_loss": 0.773253321647644 }, { "epoch": 0.2834684595610045, "step": 2867, "train/sim_loss": 0.0703125 }, { "epoch": 0.2834684595610045, "step": 2867, "train/total_loss": 0.14763784408569336 }, { "entropy": 8.942264556884766, "epoch": 0.28356733241052007, "mean_token_accuracy": 0.7244786024093628, "num_tokens": 15764249.0, "step": 2868, "train/ce_loss": 1.2858471870422363 }, { "epoch": 0.28356733241052007, "step": 2868, "train/sim_loss": 0.05859375 }, { "epoch": 0.28356733241052007, "step": 2868, "train/total_loss": 0.18717847764492035 }, { "entropy": 9.123357772827148, "epoch": 0.2836662052600356, "mean_token_accuracy": 0.7582781314849854, "num_tokens": 15769769.0, "step": 2869, "train/ce_loss": 0.408957302570343 }, { "epoch": 0.2836662052600356, "step": 2869, "train/sim_loss": 0.0625 }, { "epoch": 0.2836662052600356, "step": 2869, "train/total_loss": 0.1033957302570343 }, { "entropy": 8.973085403442383, "epoch": 0.2837650781095511, "mean_token_accuracy": 0.7729257345199585, "num_tokens": 15775400.0, "step": 2870, "train/ce_loss": 0.4746053218841553 }, { "epoch": 0.2837650781095511, "step": 2870, "train/sim_loss": 0.0859375 }, { "epoch": 0.2837650781095511, "step": 2870, "train/total_loss": 0.13339802622795105 }, { "entropy": 9.132635116577148, "epoch": 0.28386395095906664, "mean_token_accuracy": 0.706818163394928, "num_tokens": 15780862.0, "step": 2871, "train/ce_loss": 1.0251860618591309 }, { "epoch": 0.28386395095906664, "step": 2871, "train/sim_loss": 0.046875 }, { "epoch": 0.28386395095906664, "step": 2871, "train/total_loss": 0.14939361810684204 }, { "entropy": 8.847405433654785, "epoch": 0.2839628238085822, "mean_token_accuracy": 0.7508055567741394, "num_tokens": 15786459.0, "step": 2872, "train/ce_loss": 0.635254979133606 }, { "epoch": 0.2839628238085822, "step": 2872, "train/sim_loss": 0.0546875 }, { "epoch": 0.2839628238085822, "step": 2872, "train/total_loss": 0.1182129979133606 }, { "entropy": 8.849998474121094, "epoch": 0.28406169665809766, "mean_token_accuracy": 0.7469135522842407, "num_tokens": 15791941.0, "step": 2873, "train/ce_loss": 0.5951938033103943 }, { "epoch": 0.28406169665809766, "step": 2873, "train/sim_loss": 0.0625 }, { "epoch": 0.28406169665809766, "step": 2873, "train/total_loss": 0.12201938033103943 }, { "entropy": 9.189804077148438, "epoch": 0.2841605695076132, "mean_token_accuracy": 0.7718383073806763, "num_tokens": 15797343.0, "step": 2874, "train/ce_loss": 0.4805964231491089 }, { "epoch": 0.2841605695076132, "step": 2874, "train/sim_loss": 0.0390625 }, { "epoch": 0.2841605695076132, "step": 2874, "train/total_loss": 0.08712214231491089 }, { "entropy": 8.891167640686035, "epoch": 0.28425944235712874, "mean_token_accuracy": 0.7733333110809326, "num_tokens": 15802760.0, "step": 2875, "train/ce_loss": 0.5398122072219849 }, { "epoch": 0.28425944235712874, "step": 2875, "train/sim_loss": 0.046875 }, { "epoch": 0.28425944235712874, "step": 2875, "train/total_loss": 0.1008562222123146 }, { "entropy": 8.733278274536133, "epoch": 0.28435831520664423, "mean_token_accuracy": 0.812964916229248, "num_tokens": 15808317.0, "step": 2876, "train/ce_loss": 0.37547174096107483 }, { "epoch": 0.28435831520664423, "step": 2876, "train/sim_loss": 0.0234375 }, { "epoch": 0.28435831520664423, "step": 2876, "train/total_loss": 0.06098467484116554 }, { "entropy": 8.777840614318848, "epoch": 0.28445718805615977, "mean_token_accuracy": 0.7281845808029175, "num_tokens": 15813901.0, "step": 2877, "train/ce_loss": 0.7524556517601013 }, { "epoch": 0.28445718805615977, "step": 2877, "train/sim_loss": 0.0625 }, { "epoch": 0.28445718805615977, "step": 2877, "train/total_loss": 0.13774555921554565 }, { "entropy": 8.842784881591797, "epoch": 0.2845560609056753, "mean_token_accuracy": 0.7821466326713562, "num_tokens": 15819492.0, "step": 2878, "train/ce_loss": 0.6249029636383057 }, { "epoch": 0.2845560609056753, "step": 2878, "train/sim_loss": 0.06640625 }, { "epoch": 0.2845560609056753, "step": 2878, "train/total_loss": 0.1288965493440628 }, { "entropy": 9.062273025512695, "epoch": 0.2846549337551908, "mean_token_accuracy": 0.7585033774375916, "num_tokens": 15824958.0, "step": 2879, "train/ce_loss": 1.0444434881210327 }, { "epoch": 0.2846549337551908, "step": 2879, "train/sim_loss": 0.13671875 }, { "epoch": 0.2846549337551908, "step": 2879, "train/total_loss": 0.24116310477256775 }, { "epoch": 0.28475380660470634, "grad_norm": 0.9097966551780701, "learning_rate": 9.290659150472235e-06, "loss": 0.1381, "step": 2880 }, { "entropy": 9.142436027526855, "epoch": 0.28475380660470634, "mean_token_accuracy": 0.7516411542892456, "num_tokens": 15830414.0, "step": 2880, "train/ce_loss": 0.883201003074646 }, { "epoch": 0.28475380660470634, "step": 2880, "train/sim_loss": 0.125 }, { "epoch": 0.28475380660470634, "step": 2880, "train/total_loss": 0.21332010626792908 }, { "entropy": 9.301592826843262, "epoch": 0.2848526794542219, "mean_token_accuracy": 0.7467866539955139, "num_tokens": 15835763.0, "step": 2881, "train/ce_loss": 0.9369149208068848 }, { "epoch": 0.2848526794542219, "step": 2881, "train/sim_loss": 0.078125 }, { "epoch": 0.2848526794542219, "step": 2881, "train/total_loss": 0.17181649804115295 }, { "entropy": 9.149494171142578, "epoch": 0.2849515523037374, "mean_token_accuracy": 0.7291666865348816, "num_tokens": 15841221.0, "step": 2882, "train/ce_loss": 0.4768613576889038 }, { "epoch": 0.2849515523037374, "step": 2882, "train/sim_loss": 0.0625 }, { "epoch": 0.2849515523037374, "step": 2882, "train/total_loss": 0.1101861372590065 }, { "entropy": 8.895565032958984, "epoch": 0.2850504251532529, "mean_token_accuracy": 0.7642679810523987, "num_tokens": 15846694.0, "step": 2883, "train/ce_loss": 0.7473161816596985 }, { "epoch": 0.2850504251532529, "step": 2883, "train/sim_loss": 0.10546875 }, { "epoch": 0.2850504251532529, "step": 2883, "train/total_loss": 0.18020036816596985 }, { "entropy": 8.920212745666504, "epoch": 0.28514929800276845, "mean_token_accuracy": 0.7476140260696411, "num_tokens": 15852204.0, "step": 2884, "train/ce_loss": 1.0490585565567017 }, { "epoch": 0.28514929800276845, "step": 2884, "train/sim_loss": 0.03515625 }, { "epoch": 0.28514929800276845, "step": 2884, "train/total_loss": 0.1400621086359024 }, { "entropy": 8.908344268798828, "epoch": 0.285248170852284, "mean_token_accuracy": 0.7671381831169128, "num_tokens": 15857721.0, "step": 2885, "train/ce_loss": 0.543372392654419 }, { "epoch": 0.285248170852284, "step": 2885, "train/sim_loss": 0.02734375 }, { "epoch": 0.285248170852284, "step": 2885, "train/total_loss": 0.08168099075555801 }, { "entropy": 8.65149974822998, "epoch": 0.2853470437017995, "mean_token_accuracy": 0.7574170827865601, "num_tokens": 15863525.0, "step": 2886, "train/ce_loss": 0.77559494972229 }, { "epoch": 0.2853470437017995, "step": 2886, "train/sim_loss": 0.08984375 }, { "epoch": 0.2853470437017995, "step": 2886, "train/total_loss": 0.16740325093269348 }, { "entropy": 8.996011734008789, "epoch": 0.285445916551315, "mean_token_accuracy": 0.7494061589241028, "num_tokens": 15869014.0, "step": 2887, "train/ce_loss": 1.177527666091919 }, { "epoch": 0.285445916551315, "step": 2887, "train/sim_loss": 0.1171875 }, { "epoch": 0.285445916551315, "step": 2887, "train/total_loss": 0.23494026064872742 }, { "entropy": 8.829018592834473, "epoch": 0.28554478940083056, "mean_token_accuracy": 0.7068965435028076, "num_tokens": 15874576.0, "step": 2888, "train/ce_loss": 1.0056012868881226 }, { "epoch": 0.28554478940083056, "step": 2888, "train/sim_loss": 0.10546875 }, { "epoch": 0.28554478940083056, "step": 2888, "train/total_loss": 0.20602887868881226 }, { "entropy": 9.056051254272461, "epoch": 0.28564366225034604, "mean_token_accuracy": 0.8002232313156128, "num_tokens": 15880141.0, "step": 2889, "train/ce_loss": 0.825912594795227 }, { "epoch": 0.28564366225034604, "step": 2889, "train/sim_loss": 0.08203125 }, { "epoch": 0.28564366225034604, "step": 2889, "train/total_loss": 0.16462251543998718 }, { "entropy": 9.213738441467285, "epoch": 0.2857425350998616, "mean_token_accuracy": 0.7419753074645996, "num_tokens": 15885507.0, "step": 2890, "train/ce_loss": 0.5148308873176575 }, { "epoch": 0.2857425350998616, "step": 2890, "train/sim_loss": 0.05859375 }, { "epoch": 0.2857425350998616, "step": 2890, "train/total_loss": 0.11007684469223022 }, { "entropy": 9.251492500305176, "epoch": 0.2858414079493771, "mean_token_accuracy": 0.6961038708686829, "num_tokens": 15890870.0, "step": 2891, "train/ce_loss": 0.9854704737663269 }, { "epoch": 0.2858414079493771, "step": 2891, "train/sim_loss": 0.10546875 }, { "epoch": 0.2858414079493771, "step": 2891, "train/total_loss": 0.2040157914161682 }, { "entropy": 8.928323745727539, "epoch": 0.2859402807988926, "mean_token_accuracy": 0.7532994747161865, "num_tokens": 15896487.0, "step": 2892, "train/ce_loss": 0.6412045955657959 }, { "epoch": 0.2859402807988926, "step": 2892, "train/sim_loss": 0.02734375 }, { "epoch": 0.2859402807988926, "step": 2892, "train/total_loss": 0.09146421402692795 }, { "entropy": 8.710773468017578, "epoch": 0.28603915364840815, "mean_token_accuracy": 0.7851002812385559, "num_tokens": 15902176.0, "step": 2893, "train/ce_loss": 0.21104010939598083 }, { "epoch": 0.28603915364840815, "step": 2893, "train/sim_loss": 0.0234375 }, { "epoch": 0.28603915364840815, "step": 2893, "train/total_loss": 0.04454151168465614 }, { "entropy": 8.503534317016602, "epoch": 0.2861380264979237, "mean_token_accuracy": 0.7370156645774841, "num_tokens": 15907922.0, "step": 2894, "train/ce_loss": 0.5016263127326965 }, { "epoch": 0.2861380264979237, "step": 2894, "train/sim_loss": 0.0703125 }, { "epoch": 0.2861380264979237, "step": 2894, "train/total_loss": 0.12047512829303741 }, { "entropy": 8.922468185424805, "epoch": 0.2862368993474392, "mean_token_accuracy": 0.6945917010307312, "num_tokens": 15913467.0, "step": 2895, "train/ce_loss": 1.0181899070739746 }, { "epoch": 0.2862368993474392, "step": 2895, "train/sim_loss": 0.078125 }, { "epoch": 0.2862368993474392, "step": 2895, "train/total_loss": 0.1799439936876297 }, { "entropy": 9.133419036865234, "epoch": 0.2863357721969547, "mean_token_accuracy": 0.8072139024734497, "num_tokens": 15918874.0, "step": 2896, "train/ce_loss": 0.8135606646537781 }, { "epoch": 0.2863357721969547, "step": 2896, "train/sim_loss": 0.0546875 }, { "epoch": 0.2863357721969547, "step": 2896, "train/total_loss": 0.13604357838630676 }, { "entropy": 8.970626831054688, "epoch": 0.28643464504647026, "mean_token_accuracy": 0.7910602688789368, "num_tokens": 15924460.0, "step": 2897, "train/ce_loss": 0.7440759539604187 }, { "epoch": 0.28643464504647026, "step": 2897, "train/sim_loss": 0.08203125 }, { "epoch": 0.28643464504647026, "step": 2897, "train/total_loss": 0.15643885731697083 }, { "entropy": 9.278875350952148, "epoch": 0.28653351789598575, "mean_token_accuracy": 0.7818182110786438, "num_tokens": 15929828.0, "step": 2898, "train/ce_loss": 0.8908530473709106 }, { "epoch": 0.28653351789598575, "step": 2898, "train/sim_loss": 0.0390625 }, { "epoch": 0.28653351789598575, "step": 2898, "train/total_loss": 0.12814781069755554 }, { "entropy": 9.235715866088867, "epoch": 0.2866323907455013, "mean_token_accuracy": 0.7810026407241821, "num_tokens": 15935149.0, "step": 2899, "train/ce_loss": 0.6387487053871155 }, { "epoch": 0.2866323907455013, "step": 2899, "train/sim_loss": 0.078125 }, { "epoch": 0.2866323907455013, "step": 2899, "train/total_loss": 0.14199987053871155 }, { "epoch": 0.2867312635950168, "grad_norm": 0.7696937918663025, "learning_rate": 9.285714285714288e-06, "loss": 0.14, "step": 2900 }, { "entropy": 8.989718437194824, "epoch": 0.2867312635950168, "mean_token_accuracy": 0.7284210324287415, "num_tokens": 15941044.0, "step": 2900, "train/ce_loss": 0.8643961548805237 }, { "epoch": 0.2867312635950168, "step": 2900, "train/sim_loss": 0.0859375 }, { "epoch": 0.2867312635950168, "step": 2900, "train/total_loss": 0.1723771095275879 }, { "entropy": 9.09965991973877, "epoch": 0.2868301364445323, "mean_token_accuracy": 0.7496932744979858, "num_tokens": 15946454.0, "step": 2901, "train/ce_loss": 1.23293137550354 }, { "epoch": 0.2868301364445323, "step": 2901, "train/sim_loss": 0.04296875 }, { "epoch": 0.2868301364445323, "step": 2901, "train/total_loss": 0.16626188158988953 }, { "entropy": 8.907474517822266, "epoch": 0.28692900929404785, "mean_token_accuracy": 0.7524430155754089, "num_tokens": 15952038.0, "step": 2902, "train/ce_loss": 0.9235186576843262 }, { "epoch": 0.28692900929404785, "step": 2902, "train/sim_loss": 0.12109375 }, { "epoch": 0.28692900929404785, "step": 2902, "train/total_loss": 0.21344561874866486 }, { "entropy": 8.983824729919434, "epoch": 0.2870278821435634, "mean_token_accuracy": 0.7121034264564514, "num_tokens": 15957439.0, "step": 2903, "train/ce_loss": 1.0212366580963135 }, { "epoch": 0.2870278821435634, "step": 2903, "train/sim_loss": 0.08984375 }, { "epoch": 0.2870278821435634, "step": 2903, "train/total_loss": 0.1919674277305603 }, { "entropy": 8.996115684509277, "epoch": 0.2871267549930789, "mean_token_accuracy": 0.690773069858551, "num_tokens": 15962892.0, "step": 2904, "train/ce_loss": 1.864922046661377 }, { "epoch": 0.2871267549930789, "step": 2904, "train/sim_loss": 0.07421875 }, { "epoch": 0.2871267549930789, "step": 2904, "train/total_loss": 0.2607109546661377 }, { "entropy": 8.93629264831543, "epoch": 0.2872256278425944, "mean_token_accuracy": 0.7752403616905212, "num_tokens": 15968402.0, "step": 2905, "train/ce_loss": 0.7841704487800598 }, { "epoch": 0.2872256278425944, "step": 2905, "train/sim_loss": 0.09375 }, { "epoch": 0.2872256278425944, "step": 2905, "train/total_loss": 0.17216704785823822 }, { "entropy": 9.075109481811523, "epoch": 0.28732450069210996, "mean_token_accuracy": 0.7698744535446167, "num_tokens": 15973679.0, "step": 2906, "train/ce_loss": 0.5597262978553772 }, { "epoch": 0.28732450069210996, "step": 2906, "train/sim_loss": 0.04296875 }, { "epoch": 0.28732450069210996, "step": 2906, "train/total_loss": 0.0989413857460022 }, { "entropy": 8.998126983642578, "epoch": 0.28742337354162545, "mean_token_accuracy": 0.7303754091262817, "num_tokens": 15979171.0, "step": 2907, "train/ce_loss": 0.5770400166511536 }, { "epoch": 0.28742337354162545, "step": 2907, "train/sim_loss": 0.0625 }, { "epoch": 0.28742337354162545, "step": 2907, "train/total_loss": 0.12020400166511536 }, { "entropy": 8.922086715698242, "epoch": 0.287522246391141, "mean_token_accuracy": 0.787750780582428, "num_tokens": 15984729.0, "step": 2908, "train/ce_loss": 0.4732200801372528 }, { "epoch": 0.287522246391141, "step": 2908, "train/sim_loss": 0.0234375 }, { "epoch": 0.287522246391141, "step": 2908, "train/total_loss": 0.07075950503349304 }, { "entropy": 9.104681968688965, "epoch": 0.28762111924065653, "mean_token_accuracy": 0.7449495196342468, "num_tokens": 15990077.0, "step": 2909, "train/ce_loss": 0.8025069832801819 }, { "epoch": 0.28762111924065653, "step": 2909, "train/sim_loss": 0.0390625 }, { "epoch": 0.28762111924065653, "step": 2909, "train/total_loss": 0.11931320279836655 }, { "entropy": 9.264524459838867, "epoch": 0.287719992090172, "mean_token_accuracy": 0.7835325598716736, "num_tokens": 15995444.0, "step": 2910, "train/ce_loss": 0.6621771454811096 }, { "epoch": 0.287719992090172, "step": 2910, "train/sim_loss": 0.0390625 }, { "epoch": 0.287719992090172, "step": 2910, "train/total_loss": 0.10528021305799484 }, { "entropy": 9.541180610656738, "epoch": 0.28781886493968756, "mean_token_accuracy": 0.7578616142272949, "num_tokens": 16000721.0, "step": 2911, "train/ce_loss": 0.6623384952545166 }, { "epoch": 0.28781886493968756, "step": 2911, "train/sim_loss": 0.08984375 }, { "epoch": 0.28781886493968756, "step": 2911, "train/total_loss": 0.15607759356498718 }, { "entropy": 8.922453880310059, "epoch": 0.2879177377892031, "mean_token_accuracy": 0.7411477565765381, "num_tokens": 16006213.0, "step": 2912, "train/ce_loss": 0.6481139063835144 }, { "epoch": 0.2879177377892031, "step": 2912, "train/sim_loss": 0.06640625 }, { "epoch": 0.2879177377892031, "step": 2912, "train/total_loss": 0.13121764361858368 }, { "entropy": 8.856451034545898, "epoch": 0.2880166106387186, "mean_token_accuracy": 0.6926316022872925, "num_tokens": 16011852.0, "step": 2913, "train/ce_loss": 0.9316833019256592 }, { "epoch": 0.2880166106387186, "step": 2913, "train/sim_loss": 0.1484375 }, { "epoch": 0.2880166106387186, "step": 2913, "train/total_loss": 0.24160583317279816 }, { "entropy": 9.3110990524292, "epoch": 0.2881154834882341, "mean_token_accuracy": 0.7279411554336548, "num_tokens": 16017173.0, "step": 2914, "train/ce_loss": 1.3492203950881958 }, { "epoch": 0.2881154834882341, "step": 2914, "train/sim_loss": 0.07421875 }, { "epoch": 0.2881154834882341, "step": 2914, "train/total_loss": 0.20914079248905182 }, { "entropy": 9.097036361694336, "epoch": 0.28821435633774967, "mean_token_accuracy": 0.7372781038284302, "num_tokens": 16022644.0, "step": 2915, "train/ce_loss": 0.9184914827346802 }, { "epoch": 0.28821435633774967, "step": 2915, "train/sim_loss": 0.0390625 }, { "epoch": 0.28821435633774967, "step": 2915, "train/total_loss": 0.13091164827346802 }, { "entropy": 8.805046081542969, "epoch": 0.28831322918726515, "mean_token_accuracy": 0.7565582394599915, "num_tokens": 16028226.0, "step": 2916, "train/ce_loss": 0.34177541732788086 }, { "epoch": 0.28831322918726515, "step": 2916, "train/sim_loss": 0.015625 }, { "epoch": 0.28831322918726515, "step": 2916, "train/total_loss": 0.049802541732788086 }, { "entropy": 9.10892391204834, "epoch": 0.2884121020367807, "mean_token_accuracy": 0.7496671080589294, "num_tokens": 16033612.0, "step": 2917, "train/ce_loss": 0.8585209846496582 }, { "epoch": 0.2884121020367807, "step": 2917, "train/sim_loss": 0.0859375 }, { "epoch": 0.2884121020367807, "step": 2917, "train/total_loss": 0.17178960144519806 }, { "entropy": 8.972478866577148, "epoch": 0.28851097488629623, "mean_token_accuracy": 0.7408716082572937, "num_tokens": 16039102.0, "step": 2918, "train/ce_loss": 0.5948649048805237 }, { "epoch": 0.28851097488629623, "step": 2918, "train/sim_loss": 0.0703125 }, { "epoch": 0.28851097488629623, "step": 2918, "train/total_loss": 0.1297989934682846 }, { "entropy": 8.971837043762207, "epoch": 0.2886098477358117, "mean_token_accuracy": 0.7264038324356079, "num_tokens": 16044541.0, "step": 2919, "train/ce_loss": 1.1972358226776123 }, { "epoch": 0.2886098477358117, "step": 2919, "train/sim_loss": 0.1171875 }, { "epoch": 0.2886098477358117, "step": 2919, "train/total_loss": 0.2369110882282257 }, { "epoch": 0.28870872058532726, "grad_norm": 0.8549782633781433, "learning_rate": 9.280769420956338e-06, "loss": 0.1505, "step": 2920 }, { "entropy": 9.045795440673828, "epoch": 0.28870872058532726, "mean_token_accuracy": 0.773955762386322, "num_tokens": 16049997.0, "step": 2920, "train/ce_loss": 1.0599870681762695 }, { "epoch": 0.28870872058532726, "step": 2920, "train/sim_loss": 0.08984375 }, { "epoch": 0.28870872058532726, "step": 2920, "train/total_loss": 0.1958424597978592 }, { "entropy": 8.96976089477539, "epoch": 0.2888075934348428, "mean_token_accuracy": 0.7730900645256042, "num_tokens": 16055431.0, "step": 2921, "train/ce_loss": 0.811717689037323 }, { "epoch": 0.2888075934348428, "step": 2921, "train/sim_loss": 0.05859375 }, { "epoch": 0.2888075934348428, "step": 2921, "train/total_loss": 0.13976553082466125 }, { "entropy": 9.070856094360352, "epoch": 0.2889064662843583, "mean_token_accuracy": 0.771789014339447, "num_tokens": 16060939.0, "step": 2922, "train/ce_loss": 0.7894418835639954 }, { "epoch": 0.2889064662843583, "step": 2922, "train/sim_loss": 0.0625 }, { "epoch": 0.2889064662843583, "step": 2922, "train/total_loss": 0.14144419133663177 }, { "entropy": 8.800506591796875, "epoch": 0.28900533913387383, "mean_token_accuracy": 0.7260765433311462, "num_tokens": 16066434.0, "step": 2923, "train/ce_loss": 0.9343568086624146 }, { "epoch": 0.28900533913387383, "step": 2923, "train/sim_loss": 0.0703125 }, { "epoch": 0.28900533913387383, "step": 2923, "train/total_loss": 0.16374817490577698 }, { "entropy": 9.324902534484863, "epoch": 0.28910421198338937, "mean_token_accuracy": 0.7575360536575317, "num_tokens": 16071741.0, "step": 2924, "train/ce_loss": 0.6463718414306641 }, { "epoch": 0.28910421198338937, "step": 2924, "train/sim_loss": 0.07421875 }, { "epoch": 0.28910421198338937, "step": 2924, "train/total_loss": 0.1388559341430664 }, { "entropy": 9.152851104736328, "epoch": 0.2892030848329049, "mean_token_accuracy": 0.7344912886619568, "num_tokens": 16077102.0, "step": 2925, "train/ce_loss": 0.610232412815094 }, { "epoch": 0.2892030848329049, "step": 2925, "train/sim_loss": 0.0625 }, { "epoch": 0.2892030848329049, "step": 2925, "train/total_loss": 0.12352324277162552 }, { "entropy": 9.113920211791992, "epoch": 0.2893019576824204, "mean_token_accuracy": 0.7515451312065125, "num_tokens": 16082544.0, "step": 2926, "train/ce_loss": 0.6377256512641907 }, { "epoch": 0.2893019576824204, "step": 2926, "train/sim_loss": 0.0390625 }, { "epoch": 0.2893019576824204, "step": 2926, "train/total_loss": 0.10283506661653519 }, { "entropy": 9.343332290649414, "epoch": 0.28940083053193594, "mean_token_accuracy": 0.7799999713897705, "num_tokens": 16087889.0, "step": 2927, "train/ce_loss": 0.7978938221931458 }, { "epoch": 0.28940083053193594, "step": 2927, "train/sim_loss": 0.1015625 }, { "epoch": 0.28940083053193594, "step": 2927, "train/total_loss": 0.18135188519954681 }, { "entropy": 9.041967391967773, "epoch": 0.2894997033814515, "mean_token_accuracy": 0.6859296560287476, "num_tokens": 16093246.0, "step": 2928, "train/ce_loss": 0.8210824131965637 }, { "epoch": 0.2894997033814515, "step": 2928, "train/sim_loss": 0.078125 }, { "epoch": 0.2894997033814515, "step": 2928, "train/total_loss": 0.1602332442998886 }, { "entropy": 8.682047843933105, "epoch": 0.28959857623096696, "mean_token_accuracy": 0.7179487347602844, "num_tokens": 16099059.0, "step": 2929, "train/ce_loss": 0.4797767698764801 }, { "epoch": 0.28959857623096696, "step": 2929, "train/sim_loss": 0.0234375 }, { "epoch": 0.28959857623096696, "step": 2929, "train/total_loss": 0.07141517847776413 }, { "entropy": 9.062288284301758, "epoch": 0.2896974490804825, "mean_token_accuracy": 0.761966347694397, "num_tokens": 16104480.0, "step": 2930, "train/ce_loss": 0.720779299736023 }, { "epoch": 0.2896974490804825, "step": 2930, "train/sim_loss": 0.04296875 }, { "epoch": 0.2896974490804825, "step": 2930, "train/total_loss": 0.1150466799736023 }, { "entropy": 9.349395751953125, "epoch": 0.28979632192999805, "mean_token_accuracy": 0.675177276134491, "num_tokens": 16109748.0, "step": 2931, "train/ce_loss": 0.4960484206676483 }, { "epoch": 0.28979632192999805, "step": 2931, "train/sim_loss": 0.0859375 }, { "epoch": 0.28979632192999805, "step": 2931, "train/total_loss": 0.1355423480272293 }, { "entropy": 8.850358963012695, "epoch": 0.28989519477951353, "mean_token_accuracy": 0.7445759177207947, "num_tokens": 16115404.0, "step": 2932, "train/ce_loss": 0.6991676092147827 }, { "epoch": 0.28989519477951353, "step": 2932, "train/sim_loss": 0.02734375 }, { "epoch": 0.28989519477951353, "step": 2932, "train/total_loss": 0.09726051241159439 }, { "entropy": 8.951587677001953, "epoch": 0.2899940676290291, "mean_token_accuracy": 0.7946767807006836, "num_tokens": 16120797.0, "step": 2933, "train/ce_loss": 0.7948139905929565 }, { "epoch": 0.2899940676290291, "step": 2933, "train/sim_loss": 0.05859375 }, { "epoch": 0.2899940676290291, "step": 2933, "train/total_loss": 0.13807514309883118 }, { "entropy": 8.96328067779541, "epoch": 0.2900929404785446, "mean_token_accuracy": 0.7278414964675903, "num_tokens": 16126349.0, "step": 2934, "train/ce_loss": 0.49016788601875305 }, { "epoch": 0.2900929404785446, "step": 2934, "train/sim_loss": 0.078125 }, { "epoch": 0.2900929404785446, "step": 2934, "train/total_loss": 0.1271417886018753 }, { "entropy": 8.789773941040039, "epoch": 0.2901918133280601, "mean_token_accuracy": 0.757415235042572, "num_tokens": 16131971.0, "step": 2935, "train/ce_loss": 0.7580578923225403 }, { "epoch": 0.2901918133280601, "step": 2935, "train/sim_loss": 0.04296875 }, { "epoch": 0.2901918133280601, "step": 2935, "train/total_loss": 0.11877454072237015 }, { "entropy": 8.947824478149414, "epoch": 0.29029068617757564, "mean_token_accuracy": 0.7002236843109131, "num_tokens": 16137477.0, "step": 2936, "train/ce_loss": 0.9127503037452698 }, { "epoch": 0.29029068617757564, "step": 2936, "train/sim_loss": 0.09375 }, { "epoch": 0.29029068617757564, "step": 2936, "train/total_loss": 0.18502503633499146 }, { "entropy": 8.679365158081055, "epoch": 0.2903895590270912, "mean_token_accuracy": 0.7746331095695496, "num_tokens": 16143088.0, "step": 2937, "train/ce_loss": 1.129160761833191 }, { "epoch": 0.2903895590270912, "step": 2937, "train/sim_loss": 0.05859375 }, { "epoch": 0.2903895590270912, "step": 2937, "train/total_loss": 0.17150983214378357 }, { "entropy": 8.891587257385254, "epoch": 0.29048843187660667, "mean_token_accuracy": 0.6561264991760254, "num_tokens": 16148771.0, "step": 2938, "train/ce_loss": 0.763384222984314 }, { "epoch": 0.29048843187660667, "step": 2938, "train/sim_loss": 0.0546875 }, { "epoch": 0.29048843187660667, "step": 2938, "train/total_loss": 0.13102592527866364 }, { "entropy": 8.735174179077148, "epoch": 0.2905873047261222, "mean_token_accuracy": 0.7413213849067688, "num_tokens": 16154285.0, "step": 2939, "train/ce_loss": 0.8858610987663269 }, { "epoch": 0.2905873047261222, "step": 2939, "train/sim_loss": 0.03515625 }, { "epoch": 0.2905873047261222, "step": 2939, "train/total_loss": 0.12374236434698105 }, { "epoch": 0.29068617757563775, "grad_norm": 0.9504665732383728, "learning_rate": 9.275824556198389e-06, "loss": 0.1497, "step": 2940 }, { "entropy": 9.09089469909668, "epoch": 0.29068617757563775, "mean_token_accuracy": 0.7235668897628784, "num_tokens": 16159709.0, "step": 2940, "train/ce_loss": 0.5810391306877136 }, { "epoch": 0.29068617757563775, "step": 2940, "train/sim_loss": 0.10546875 }, { "epoch": 0.29068617757563775, "step": 2940, "train/total_loss": 0.16357266902923584 }, { "entropy": 8.97586441040039, "epoch": 0.29078505042515324, "mean_token_accuracy": 0.7479674816131592, "num_tokens": 16165198.0, "step": 2941, "train/ce_loss": 0.9296789765357971 }, { "epoch": 0.29078505042515324, "step": 2941, "train/sim_loss": 0.0703125 }, { "epoch": 0.29078505042515324, "step": 2941, "train/total_loss": 0.1632803976535797 }, { "entropy": 8.855178833007812, "epoch": 0.2908839232746688, "mean_token_accuracy": 0.7026143670082092, "num_tokens": 16170713.0, "step": 2942, "train/ce_loss": 0.6282690167427063 }, { "epoch": 0.2908839232746688, "step": 2942, "train/sim_loss": 0.05078125 }, { "epoch": 0.2908839232746688, "step": 2942, "train/total_loss": 0.11360815167427063 }, { "entropy": 9.018023490905762, "epoch": 0.2909827961241843, "mean_token_accuracy": 0.7420538067817688, "num_tokens": 16176158.0, "step": 2943, "train/ce_loss": 0.5659158229827881 }, { "epoch": 0.2909827961241843, "step": 2943, "train/sim_loss": 0.0546875 }, { "epoch": 0.2909827961241843, "step": 2943, "train/total_loss": 0.11127908527851105 }, { "entropy": 8.862997055053711, "epoch": 0.2910816689736998, "mean_token_accuracy": 0.7206572890281677, "num_tokens": 16181553.0, "step": 2944, "train/ce_loss": 1.6836292743682861 }, { "epoch": 0.2910816689736998, "step": 2944, "train/sim_loss": 0.1328125 }, { "epoch": 0.2910816689736998, "step": 2944, "train/total_loss": 0.30117541551589966 }, { "entropy": 9.03799819946289, "epoch": 0.29118054182321534, "mean_token_accuracy": 0.781879186630249, "num_tokens": 16187116.0, "step": 2945, "train/ce_loss": 0.4854666590690613 }, { "epoch": 0.29118054182321534, "step": 2945, "train/sim_loss": 0.01953125 }, { "epoch": 0.29118054182321534, "step": 2945, "train/total_loss": 0.0680779218673706 }, { "entropy": 8.813047409057617, "epoch": 0.2912794146727309, "mean_token_accuracy": 0.7644276022911072, "num_tokens": 16192805.0, "step": 2946, "train/ce_loss": 0.6287742853164673 }, { "epoch": 0.2912794146727309, "step": 2946, "train/sim_loss": 0.0703125 }, { "epoch": 0.2912794146727309, "step": 2946, "train/total_loss": 0.13318993151187897 }, { "entropy": 8.979032516479492, "epoch": 0.29137828752224637, "mean_token_accuracy": 0.6997578740119934, "num_tokens": 16198289.0, "step": 2947, "train/ce_loss": 1.5876655578613281 }, { "epoch": 0.29137828752224637, "step": 2947, "train/sim_loss": 0.05859375 }, { "epoch": 0.29137828752224637, "step": 2947, "train/total_loss": 0.21736030280590057 }, { "entropy": 9.167853355407715, "epoch": 0.2914771603717619, "mean_token_accuracy": 0.7025761008262634, "num_tokens": 16203707.0, "step": 2948, "train/ce_loss": 0.4092552959918976 }, { "epoch": 0.2914771603717619, "step": 2948, "train/sim_loss": 0.046875 }, { "epoch": 0.2914771603717619, "step": 2948, "train/total_loss": 0.087800532579422 }, { "entropy": 8.960417747497559, "epoch": 0.29157603322127745, "mean_token_accuracy": 0.6881987452507019, "num_tokens": 16209012.0, "step": 2949, "train/ce_loss": 1.1852591037750244 }, { "epoch": 0.29157603322127745, "step": 2949, "train/sim_loss": 0.10546875 }, { "epoch": 0.29157603322127745, "step": 2949, "train/total_loss": 0.2239946722984314 }, { "entropy": 8.959266662597656, "epoch": 0.29167490607079294, "mean_token_accuracy": 0.7649208307266235, "num_tokens": 16214450.0, "step": 2950, "train/ce_loss": 1.101574420928955 }, { "epoch": 0.29167490607079294, "step": 2950, "train/sim_loss": 0.0859375 }, { "epoch": 0.29167490607079294, "step": 2950, "train/total_loss": 0.19609494507312775 }, { "entropy": 8.963184356689453, "epoch": 0.2917737789203085, "mean_token_accuracy": 0.7098624110221863, "num_tokens": 16219935.0, "step": 2951, "train/ce_loss": 1.7133657932281494 }, { "epoch": 0.2917737789203085, "step": 2951, "train/sim_loss": 0.046875 }, { "epoch": 0.2917737789203085, "step": 2951, "train/total_loss": 0.2182115763425827 }, { "entropy": 8.904327392578125, "epoch": 0.291872651769824, "mean_token_accuracy": 0.7524644136428833, "num_tokens": 16225505.0, "step": 2952, "train/ce_loss": 0.34471333026885986 }, { "epoch": 0.291872651769824, "step": 2952, "train/sim_loss": 0.06640625 }, { "epoch": 0.291872651769824, "step": 2952, "train/total_loss": 0.10087758302688599 }, { "entropy": 9.143685340881348, "epoch": 0.2919715246193395, "mean_token_accuracy": 0.7732240557670593, "num_tokens": 16230867.0, "step": 2953, "train/ce_loss": 0.819869339466095 }, { "epoch": 0.2919715246193395, "step": 2953, "train/sim_loss": 0.0390625 }, { "epoch": 0.2919715246193395, "step": 2953, "train/total_loss": 0.1210494339466095 }, { "entropy": 9.220577239990234, "epoch": 0.29207039746885505, "mean_token_accuracy": 0.7678571343421936, "num_tokens": 16236209.0, "step": 2954, "train/ce_loss": 0.9559524655342102 }, { "epoch": 0.29207039746885505, "step": 2954, "train/sim_loss": 0.0234375 }, { "epoch": 0.29207039746885505, "step": 2954, "train/total_loss": 0.11903274804353714 }, { "entropy": 8.780811309814453, "epoch": 0.2921692703183706, "mean_token_accuracy": 0.7626942992210388, "num_tokens": 16241826.0, "step": 2955, "train/ce_loss": 0.6784602403640747 }, { "epoch": 0.2921692703183706, "step": 2955, "train/sim_loss": 0.046875 }, { "epoch": 0.2921692703183706, "step": 2955, "train/total_loss": 0.11472102254629135 }, { "entropy": 8.981622695922852, "epoch": 0.2922681431678861, "mean_token_accuracy": 0.7304643392562866, "num_tokens": 16247316.0, "step": 2956, "train/ce_loss": 0.9320663213729858 }, { "epoch": 0.2922681431678861, "step": 2956, "train/sim_loss": 0.109375 }, { "epoch": 0.2922681431678861, "step": 2956, "train/total_loss": 0.20258164405822754 }, { "entropy": 9.260414123535156, "epoch": 0.2923670160174016, "mean_token_accuracy": 0.7391910552978516, "num_tokens": 16252683.0, "step": 2957, "train/ce_loss": 1.0375072956085205 }, { "epoch": 0.2923670160174016, "step": 2957, "train/sim_loss": 0.046875 }, { "epoch": 0.2923670160174016, "step": 2957, "train/total_loss": 0.15062573552131653 }, { "entropy": 9.262503623962402, "epoch": 0.29246588886691716, "mean_token_accuracy": 0.7227585911750793, "num_tokens": 16258042.0, "step": 2958, "train/ce_loss": 0.6910064816474915 }, { "epoch": 0.29246588886691716, "step": 2958, "train/sim_loss": 0.05078125 }, { "epoch": 0.29246588886691716, "step": 2958, "train/total_loss": 0.11988189816474915 }, { "entropy": 9.08055305480957, "epoch": 0.29256476171643264, "mean_token_accuracy": 0.742895781993866, "num_tokens": 16263421.0, "step": 2959, "train/ce_loss": 0.4277103841304779 }, { "epoch": 0.29256476171643264, "step": 2959, "train/sim_loss": 0.0859375 }, { "epoch": 0.29256476171643264, "step": 2959, "train/total_loss": 0.12870854139328003 }, { "epoch": 0.2926636345659482, "grad_norm": 0.9318937063217163, "learning_rate": 9.270879691440439e-06, "loss": 0.1509, "step": 2960 }, { "entropy": 9.027239799499512, "epoch": 0.2926636345659482, "mean_token_accuracy": 0.7165449857711792, "num_tokens": 16268816.0, "step": 2960, "train/ce_loss": 0.8233897089958191 }, { "epoch": 0.2926636345659482, "step": 2960, "train/sim_loss": 0.046875 }, { "epoch": 0.2926636345659482, "step": 2960, "train/total_loss": 0.12921397387981415 }, { "entropy": 9.030412673950195, "epoch": 0.2927625074154637, "mean_token_accuracy": 0.7592829465866089, "num_tokens": 16274221.0, "step": 2961, "train/ce_loss": 0.4879069924354553 }, { "epoch": 0.2927625074154637, "step": 2961, "train/sim_loss": 0.0703125 }, { "epoch": 0.2927625074154637, "step": 2961, "train/total_loss": 0.11910320073366165 }, { "entropy": 8.939789772033691, "epoch": 0.2928613802649792, "mean_token_accuracy": 0.7257888913154602, "num_tokens": 16279699.0, "step": 2962, "train/ce_loss": 0.6610260605812073 }, { "epoch": 0.2928613802649792, "step": 2962, "train/sim_loss": 0.0703125 }, { "epoch": 0.2928613802649792, "step": 2962, "train/total_loss": 0.13641510903835297 }, { "entropy": 9.28176498413086, "epoch": 0.29296025311449475, "mean_token_accuracy": 0.7302013635635376, "num_tokens": 16285053.0, "step": 2963, "train/ce_loss": 0.6509261727333069 }, { "epoch": 0.29296025311449475, "step": 2963, "train/sim_loss": 0.01953125 }, { "epoch": 0.29296025311449475, "step": 2963, "train/total_loss": 0.08462386578321457 }, { "entropy": 9.088325500488281, "epoch": 0.2930591259640103, "mean_token_accuracy": 0.7605633735656738, "num_tokens": 16290438.0, "step": 2964, "train/ce_loss": 0.6102171540260315 }, { "epoch": 0.2930591259640103, "step": 2964, "train/sim_loss": 0.04296875 }, { "epoch": 0.2930591259640103, "step": 2964, "train/total_loss": 0.10399046540260315 }, { "entropy": 9.02169418334961, "epoch": 0.29315799881352583, "mean_token_accuracy": 0.740362823009491, "num_tokens": 16295904.0, "step": 2965, "train/ce_loss": 1.050837755203247 }, { "epoch": 0.29315799881352583, "step": 2965, "train/sim_loss": 0.11328125 }, { "epoch": 0.29315799881352583, "step": 2965, "train/total_loss": 0.21836502850055695 }, { "entropy": 8.672990798950195, "epoch": 0.2932568716630413, "mean_token_accuracy": 0.7324954867362976, "num_tokens": 16301758.0, "step": 2966, "train/ce_loss": 0.7352502346038818 }, { "epoch": 0.2932568716630413, "step": 2966, "train/sim_loss": 0.04296875 }, { "epoch": 0.2932568716630413, "step": 2966, "train/total_loss": 0.11649377644062042 }, { "entropy": 9.291250228881836, "epoch": 0.29335574451255686, "mean_token_accuracy": 0.7569955587387085, "num_tokens": 16306968.0, "step": 2967, "train/ce_loss": 0.8225297331809998 }, { "epoch": 0.29335574451255686, "step": 2967, "train/sim_loss": 0.08203125 }, { "epoch": 0.29335574451255686, "step": 2967, "train/total_loss": 0.16428422927856445 }, { "entropy": 9.12034797668457, "epoch": 0.2934546173620724, "mean_token_accuracy": 0.7649082541465759, "num_tokens": 16312401.0, "step": 2968, "train/ce_loss": 0.34501299262046814 }, { "epoch": 0.2934546173620724, "step": 2968, "train/sim_loss": 0.03125 }, { "epoch": 0.2934546173620724, "step": 2968, "train/total_loss": 0.06575129926204681 }, { "entropy": 9.178654670715332, "epoch": 0.2935534902115879, "mean_token_accuracy": 0.7606635093688965, "num_tokens": 16317868.0, "step": 2969, "train/ce_loss": 0.6053792834281921 }, { "epoch": 0.2935534902115879, "step": 2969, "train/sim_loss": 0.078125 }, { "epoch": 0.2935534902115879, "step": 2969, "train/total_loss": 0.1386629343032837 }, { "entropy": 8.207405090332031, "epoch": 0.2936523630611034, "mean_token_accuracy": 0.7473583221435547, "num_tokens": 16323552.0, "step": 2970, "train/ce_loss": 0.6979049444198608 }, { "epoch": 0.2936523630611034, "step": 2970, "train/sim_loss": 0.07421875 }, { "epoch": 0.2936523630611034, "step": 2970, "train/total_loss": 0.14400924742221832 }, { "entropy": 9.079988479614258, "epoch": 0.29375123591061897, "mean_token_accuracy": 0.7270194888114929, "num_tokens": 16328804.0, "step": 2971, "train/ce_loss": 0.7715155482292175 }, { "epoch": 0.29375123591061897, "step": 2971, "train/sim_loss": 0.0625 }, { "epoch": 0.29375123591061897, "step": 2971, "train/total_loss": 0.1396515667438507 }, { "entropy": 8.589347839355469, "epoch": 0.29385010876013445, "mean_token_accuracy": 0.7312961220741272, "num_tokens": 16334429.0, "step": 2972, "train/ce_loss": 1.3977593183517456 }, { "epoch": 0.29385010876013445, "step": 2972, "train/sim_loss": 0.07421875 }, { "epoch": 0.29385010876013445, "step": 2972, "train/total_loss": 0.21399468183517456 }, { "entropy": 8.986007690429688, "epoch": 0.29394898160965, "mean_token_accuracy": 0.7753883004188538, "num_tokens": 16339868.0, "step": 2973, "train/ce_loss": 0.6390880942344666 }, { "epoch": 0.29394898160965, "step": 2973, "train/sim_loss": 0.03515625 }, { "epoch": 0.29394898160965, "step": 2973, "train/total_loss": 0.09906505793333054 }, { "entropy": 9.07227897644043, "epoch": 0.29404785445916554, "mean_token_accuracy": 0.7971938848495483, "num_tokens": 16345248.0, "step": 2974, "train/ce_loss": 0.8721835017204285 }, { "epoch": 0.29404785445916554, "step": 2974, "train/sim_loss": 0.0859375 }, { "epoch": 0.29404785445916554, "step": 2974, "train/total_loss": 0.17315584421157837 }, { "entropy": 8.801551818847656, "epoch": 0.294146727308681, "mean_token_accuracy": 0.7341907620429993, "num_tokens": 16350832.0, "step": 2975, "train/ce_loss": 0.7170195579528809 }, { "epoch": 0.294146727308681, "step": 2975, "train/sim_loss": 0.0546875 }, { "epoch": 0.294146727308681, "step": 2975, "train/total_loss": 0.12638945877552032 }, { "entropy": 8.80726432800293, "epoch": 0.29424560015819656, "mean_token_accuracy": 0.7610701322555542, "num_tokens": 16356526.0, "step": 2976, "train/ce_loss": 0.3462585210800171 }, { "epoch": 0.29424560015819656, "step": 2976, "train/sim_loss": 0.01953125 }, { "epoch": 0.29424560015819656, "step": 2976, "train/total_loss": 0.05415710434317589 }, { "entropy": 9.316790580749512, "epoch": 0.2943444730077121, "mean_token_accuracy": 0.6990172266960144, "num_tokens": 16361952.0, "step": 2977, "train/ce_loss": 1.0662766695022583 }, { "epoch": 0.2943444730077121, "step": 2977, "train/sim_loss": 0.1015625 }, { "epoch": 0.2943444730077121, "step": 2977, "train/total_loss": 0.2081901729106903 }, { "entropy": 8.75393009185791, "epoch": 0.2944433458572276, "mean_token_accuracy": 0.7123420834541321, "num_tokens": 16367527.0, "step": 2978, "train/ce_loss": 1.9367862939834595 }, { "epoch": 0.2944433458572276, "step": 2978, "train/sim_loss": 0.0625 }, { "epoch": 0.2944433458572276, "step": 2978, "train/total_loss": 0.256178617477417 }, { "entropy": 9.297779083251953, "epoch": 0.29454221870674313, "mean_token_accuracy": 0.75789475440979, "num_tokens": 16372756.0, "step": 2979, "train/ce_loss": 1.1591320037841797 }, { "epoch": 0.29454221870674313, "step": 2979, "train/sim_loss": 0.0703125 }, { "epoch": 0.29454221870674313, "step": 2979, "train/total_loss": 0.18622571229934692 }, { "epoch": 0.29464109155625867, "grad_norm": 0.9256479740142822, "learning_rate": 9.265934826682491e-06, "loss": 0.1462, "step": 2980 }, { "entropy": 9.151491165161133, "epoch": 0.29464109155625867, "mean_token_accuracy": 0.7878378629684448, "num_tokens": 16378151.0, "step": 2980, "train/ce_loss": 0.4788009226322174 }, { "epoch": 0.29464109155625867, "step": 2980, "train/sim_loss": 0.0546875 }, { "epoch": 0.29464109155625867, "step": 2980, "train/total_loss": 0.10256759822368622 }, { "entropy": 9.06621265411377, "epoch": 0.29473996440577416, "mean_token_accuracy": 0.7724686861038208, "num_tokens": 16383745.0, "step": 2981, "train/ce_loss": 1.0098819732666016 }, { "epoch": 0.29473996440577416, "step": 2981, "train/sim_loss": 0.03125 }, { "epoch": 0.29473996440577416, "step": 2981, "train/total_loss": 0.1322382092475891 }, { "entropy": 8.675113677978516, "epoch": 0.2948388372552897, "mean_token_accuracy": 0.7619485259056091, "num_tokens": 16389515.0, "step": 2982, "train/ce_loss": 0.9904683232307434 }, { "epoch": 0.2948388372552897, "step": 2982, "train/sim_loss": 0.109375 }, { "epoch": 0.2948388372552897, "step": 2982, "train/total_loss": 0.20842182636260986 }, { "entropy": 9.1099271774292, "epoch": 0.29493771010480524, "mean_token_accuracy": 0.7052896618843079, "num_tokens": 16394875.0, "step": 2983, "train/ce_loss": 0.6730944514274597 }, { "epoch": 0.29493771010480524, "step": 2983, "train/sim_loss": 0.03515625 }, { "epoch": 0.29493771010480524, "step": 2983, "train/total_loss": 0.10246569663286209 }, { "entropy": 9.247035026550293, "epoch": 0.2950365829543207, "mean_token_accuracy": 0.7575757503509521, "num_tokens": 16400191.0, "step": 2984, "train/ce_loss": 0.5862237811088562 }, { "epoch": 0.2950365829543207, "step": 2984, "train/sim_loss": 0.04296875 }, { "epoch": 0.2950365829543207, "step": 2984, "train/total_loss": 0.10159112513065338 }, { "entropy": 9.056817054748535, "epoch": 0.29513545580383627, "mean_token_accuracy": 0.7420494556427002, "num_tokens": 16405685.0, "step": 2985, "train/ce_loss": 0.690122663974762 }, { "epoch": 0.29513545580383627, "step": 2985, "train/sim_loss": 0.15625 }, { "epoch": 0.29513545580383627, "step": 2985, "train/total_loss": 0.22526226937770844 }, { "entropy": 9.102200508117676, "epoch": 0.2952343286533518, "mean_token_accuracy": 0.7549824118614197, "num_tokens": 16411202.0, "step": 2986, "train/ce_loss": 1.0061630010604858 }, { "epoch": 0.2952343286533518, "step": 2986, "train/sim_loss": 0.0703125 }, { "epoch": 0.2952343286533518, "step": 2986, "train/total_loss": 0.17092880606651306 }, { "entropy": 8.908931732177734, "epoch": 0.2953332015028673, "mean_token_accuracy": 0.7787513732910156, "num_tokens": 16416688.0, "step": 2987, "train/ce_loss": 0.6617681384086609 }, { "epoch": 0.2953332015028673, "step": 2987, "train/sim_loss": 0.05859375 }, { "epoch": 0.2953332015028673, "step": 2987, "train/total_loss": 0.12477056682109833 }, { "entropy": 9.28451156616211, "epoch": 0.29543207435238283, "mean_token_accuracy": 0.7579213976860046, "num_tokens": 16422069.0, "step": 2988, "train/ce_loss": 1.0702464580535889 }, { "epoch": 0.29543207435238283, "step": 2988, "train/sim_loss": 0.08984375 }, { "epoch": 0.29543207435238283, "step": 2988, "train/total_loss": 0.1968683898448944 }, { "entropy": 9.06583023071289, "epoch": 0.2955309472018984, "mean_token_accuracy": 0.76106196641922, "num_tokens": 16427572.0, "step": 2989, "train/ce_loss": 0.7496554851531982 }, { "epoch": 0.2955309472018984, "step": 2989, "train/sim_loss": 0.08203125 }, { "epoch": 0.2955309472018984, "step": 2989, "train/total_loss": 0.15699680149555206 }, { "entropy": 8.733222961425781, "epoch": 0.29562982005141386, "mean_token_accuracy": 0.7472885251045227, "num_tokens": 16433082.0, "step": 2990, "train/ce_loss": 0.6620703935623169 }, { "epoch": 0.29562982005141386, "step": 2990, "train/sim_loss": 0.09375 }, { "epoch": 0.29562982005141386, "step": 2990, "train/total_loss": 0.15995705127716064 }, { "entropy": 8.957558631896973, "epoch": 0.2957286929009294, "mean_token_accuracy": 0.7585768699645996, "num_tokens": 16438539.0, "step": 2991, "train/ce_loss": 0.4927710294723511 }, { "epoch": 0.2957286929009294, "step": 2991, "train/sim_loss": 0.0625 }, { "epoch": 0.2957286929009294, "step": 2991, "train/total_loss": 0.11177710443735123 }, { "entropy": 9.201066017150879, "epoch": 0.29582756575044494, "mean_token_accuracy": 0.7368420958518982, "num_tokens": 16443810.0, "step": 2992, "train/ce_loss": 0.9986974000930786 }, { "epoch": 0.29582756575044494, "step": 2992, "train/sim_loss": 0.046875 }, { "epoch": 0.29582756575044494, "step": 2992, "train/total_loss": 0.1467447429895401 }, { "entropy": 8.678627014160156, "epoch": 0.29592643859996043, "mean_token_accuracy": 0.716950535774231, "num_tokens": 16449525.0, "step": 2993, "train/ce_loss": 1.4996041059494019 }, { "epoch": 0.29592643859996043, "step": 2993, "train/sim_loss": 0.07421875 }, { "epoch": 0.29592643859996043, "step": 2993, "train/total_loss": 0.22417916357517242 }, { "entropy": 8.799978256225586, "epoch": 0.29602531144947597, "mean_token_accuracy": 0.7037433385848999, "num_tokens": 16455087.0, "step": 2994, "train/ce_loss": 0.7596735954284668 }, { "epoch": 0.29602531144947597, "step": 2994, "train/sim_loss": 0.0390625 }, { "epoch": 0.29602531144947597, "step": 2994, "train/total_loss": 0.11502986401319504 }, { "entropy": 9.000847816467285, "epoch": 0.2961241842989915, "mean_token_accuracy": 0.7126865386962891, "num_tokens": 16460534.0, "step": 2995, "train/ce_loss": 0.3392336368560791 }, { "epoch": 0.2961241842989915, "step": 2995, "train/sim_loss": 0.01953125 }, { "epoch": 0.2961241842989915, "step": 2995, "train/total_loss": 0.05345461517572403 }, { "entropy": 9.206645965576172, "epoch": 0.296223057148507, "mean_token_accuracy": 0.7412280440330505, "num_tokens": 16465798.0, "step": 2996, "train/ce_loss": 1.1677064895629883 }, { "epoch": 0.296223057148507, "step": 2996, "train/sim_loss": 0.01953125 }, { "epoch": 0.296223057148507, "step": 2996, "train/total_loss": 0.1363019049167633 }, { "entropy": 9.027484893798828, "epoch": 0.29632192999802254, "mean_token_accuracy": 0.7827102541923523, "num_tokens": 16471319.0, "step": 2997, "train/ce_loss": 0.49547505378723145 }, { "epoch": 0.29632192999802254, "step": 2997, "train/sim_loss": 0.0625 }, { "epoch": 0.29632192999802254, "step": 2997, "train/total_loss": 0.11204750835895538 }, { "entropy": 9.239718437194824, "epoch": 0.2964208028475381, "mean_token_accuracy": 0.7563587427139282, "num_tokens": 16476770.0, "step": 2998, "train/ce_loss": 1.5282140970230103 }, { "epoch": 0.2964208028475381, "step": 2998, "train/sim_loss": 0.1328125 }, { "epoch": 0.2964208028475381, "step": 2998, "train/total_loss": 0.28563392162323 }, { "entropy": 9.176612854003906, "epoch": 0.29651967569705356, "mean_token_accuracy": 0.7097591757774353, "num_tokens": 16482050.0, "step": 2999, "train/ce_loss": 0.7791761755943298 }, { "epoch": 0.29651967569705356, "step": 2999, "train/sim_loss": 0.0625 }, { "epoch": 0.29651967569705356, "step": 2999, "train/total_loss": 0.14041762053966522 }, { "epoch": 0.2966185485465691, "grad_norm": 0.7404053211212158, "learning_rate": 9.260989961924542e-06, "loss": 0.1424, "step": 3000 }, { "entropy": 8.75211238861084, "epoch": 0.2966185485465691, "mean_token_accuracy": 0.7609921097755432, "num_tokens": 16487584.0, "step": 3000, "train/ce_loss": 1.3104459047317505 }, { "epoch": 0.2966185485465691, "step": 3000, "train/sim_loss": 0.08203125 }, { "epoch": 0.2966185485465691, "step": 3000, "train/total_loss": 0.21307584643363953 }, { "entropy": 9.06614875793457, "epoch": 0.29671742139608465, "mean_token_accuracy": 0.7625298500061035, "num_tokens": 16493155.0, "step": 3001, "train/ce_loss": 0.6529265642166138 }, { "epoch": 0.29671742139608465, "step": 3001, "train/sim_loss": 0.0546875 }, { "epoch": 0.29671742139608465, "step": 3001, "train/total_loss": 0.11998015642166138 }, { "entropy": 8.5331449508667, "epoch": 0.29681629424560013, "mean_token_accuracy": 0.6991434693336487, "num_tokens": 16498699.0, "step": 3002, "train/ce_loss": 1.0547153949737549 }, { "epoch": 0.29681629424560013, "step": 3002, "train/sim_loss": 0.046875 }, { "epoch": 0.29681629424560013, "step": 3002, "train/total_loss": 0.15234655141830444 }, { "entropy": 8.987889289855957, "epoch": 0.2969151670951157, "mean_token_accuracy": 0.7331671118736267, "num_tokens": 16504133.0, "step": 3003, "train/ce_loss": 0.8472402095794678 }, { "epoch": 0.2969151670951157, "step": 3003, "train/sim_loss": 0.03515625 }, { "epoch": 0.2969151670951157, "step": 3003, "train/total_loss": 0.11988027393817902 }, { "entropy": 9.307809829711914, "epoch": 0.2970140399446312, "mean_token_accuracy": 0.7377049326896667, "num_tokens": 16509485.0, "step": 3004, "train/ce_loss": 0.49713826179504395 }, { "epoch": 0.2970140399446312, "step": 3004, "train/sim_loss": 0.0625 }, { "epoch": 0.2970140399446312, "step": 3004, "train/total_loss": 0.11221382766962051 }, { "entropy": 9.262511253356934, "epoch": 0.2971129127941467, "mean_token_accuracy": 0.7232375741004944, "num_tokens": 16514820.0, "step": 3005, "train/ce_loss": 0.8785149455070496 }, { "epoch": 0.2971129127941467, "step": 3005, "train/sim_loss": 0.078125 }, { "epoch": 0.2971129127941467, "step": 3005, "train/total_loss": 0.16597649455070496 }, { "entropy": 9.319053649902344, "epoch": 0.29721178564366224, "mean_token_accuracy": 0.7388688325881958, "num_tokens": 16520226.0, "step": 3006, "train/ce_loss": 0.8520228862762451 }, { "epoch": 0.29721178564366224, "step": 3006, "train/sim_loss": 0.046875 }, { "epoch": 0.29721178564366224, "step": 3006, "train/total_loss": 0.13207729160785675 }, { "entropy": 8.830718994140625, "epoch": 0.2973106584931778, "mean_token_accuracy": 0.7207207083702087, "num_tokens": 16525756.0, "step": 3007, "train/ce_loss": 0.5321862101554871 }, { "epoch": 0.2973106584931778, "step": 3007, "train/sim_loss": 0.046875 }, { "epoch": 0.2973106584931778, "step": 3007, "train/total_loss": 0.10009361803531647 }, { "entropy": 8.956975936889648, "epoch": 0.2974095313426933, "mean_token_accuracy": 0.7614555358886719, "num_tokens": 16531325.0, "step": 3008, "train/ce_loss": 0.7484789490699768 }, { "epoch": 0.2974095313426933, "step": 3008, "train/sim_loss": 0.109375 }, { "epoch": 0.2974095313426933, "step": 3008, "train/total_loss": 0.18422290682792664 }, { "entropy": 9.220966339111328, "epoch": 0.2975084041922088, "mean_token_accuracy": 0.7178477644920349, "num_tokens": 16536711.0, "step": 3009, "train/ce_loss": 1.7973988056182861 }, { "epoch": 0.2975084041922088, "step": 3009, "train/sim_loss": 0.0703125 }, { "epoch": 0.2975084041922088, "step": 3009, "train/total_loss": 0.25005239248275757 }, { "entropy": 9.01079273223877, "epoch": 0.29760727704172435, "mean_token_accuracy": 0.7351154088973999, "num_tokens": 16542131.0, "step": 3010, "train/ce_loss": 0.9730855226516724 }, { "epoch": 0.29760727704172435, "step": 3010, "train/sim_loss": 0.0546875 }, { "epoch": 0.29760727704172435, "step": 3010, "train/total_loss": 0.15199604630470276 }, { "entropy": 9.156578063964844, "epoch": 0.2977061498912399, "mean_token_accuracy": 0.7442143559455872, "num_tokens": 16547582.0, "step": 3011, "train/ce_loss": 0.8931745290756226 }, { "epoch": 0.2977061498912399, "step": 3011, "train/sim_loss": 0.109375 }, { "epoch": 0.2977061498912399, "step": 3011, "train/total_loss": 0.1986924558877945 }, { "entropy": 8.967742919921875, "epoch": 0.2978050227407554, "mean_token_accuracy": 0.7598152160644531, "num_tokens": 16553134.0, "step": 3012, "train/ce_loss": 0.450534850358963 }, { "epoch": 0.2978050227407554, "step": 3012, "train/sim_loss": 0.08984375 }, { "epoch": 0.2978050227407554, "step": 3012, "train/total_loss": 0.13489723205566406 }, { "entropy": 9.011186599731445, "epoch": 0.2979038955902709, "mean_token_accuracy": 0.7155963182449341, "num_tokens": 16558558.0, "step": 3013, "train/ce_loss": 0.6378522515296936 }, { "epoch": 0.2979038955902709, "step": 3013, "train/sim_loss": 0.0703125 }, { "epoch": 0.2979038955902709, "step": 3013, "train/total_loss": 0.13409772515296936 }, { "entropy": 8.421560287475586, "epoch": 0.29800276843978646, "mean_token_accuracy": 0.7080000042915344, "num_tokens": 16564352.0, "step": 3014, "train/ce_loss": 1.7380918264389038 }, { "epoch": 0.29800276843978646, "step": 3014, "train/sim_loss": 0.046875 }, { "epoch": 0.29800276843978646, "step": 3014, "train/total_loss": 0.22068418562412262 }, { "entropy": 9.047128677368164, "epoch": 0.29810164128930194, "mean_token_accuracy": 0.735154390335083, "num_tokens": 16569797.0, "step": 3015, "train/ce_loss": 1.1030935049057007 }, { "epoch": 0.29810164128930194, "step": 3015, "train/sim_loss": 0.109375 }, { "epoch": 0.29810164128930194, "step": 3015, "train/total_loss": 0.21968436241149902 }, { "entropy": 8.850757598876953, "epoch": 0.2982005141388175, "mean_token_accuracy": 0.7679283022880554, "num_tokens": 16575374.0, "step": 3016, "train/ce_loss": 0.5861003994941711 }, { "epoch": 0.2982005141388175, "step": 3016, "train/sim_loss": 0.0703125 }, { "epoch": 0.2982005141388175, "step": 3016, "train/total_loss": 0.12892253696918488 }, { "entropy": 9.326733589172363, "epoch": 0.298299386988333, "mean_token_accuracy": 0.7540740966796875, "num_tokens": 16580638.0, "step": 3017, "train/ce_loss": 0.2837834656238556 }, { "epoch": 0.298299386988333, "step": 3017, "train/sim_loss": 0.046875 }, { "epoch": 0.298299386988333, "step": 3017, "train/total_loss": 0.07525334507226944 }, { "entropy": 9.20815658569336, "epoch": 0.2983982598378485, "mean_token_accuracy": 0.7598425149917603, "num_tokens": 16586067.0, "step": 3018, "train/ce_loss": 1.196742296218872 }, { "epoch": 0.2983982598378485, "step": 3018, "train/sim_loss": 0.07421875 }, { "epoch": 0.2983982598378485, "step": 3018, "train/total_loss": 0.19389298558235168 }, { "entropy": 8.650609970092773, "epoch": 0.29849713268736405, "mean_token_accuracy": 0.7605956196784973, "num_tokens": 16591623.0, "step": 3019, "train/ce_loss": 1.3087620735168457 }, { "epoch": 0.29849713268736405, "step": 3019, "train/sim_loss": 0.046875 }, { "epoch": 0.29849713268736405, "step": 3019, "train/total_loss": 0.17775121331214905 }, { "epoch": 0.2985960055368796, "grad_norm": 0.811733603477478, "learning_rate": 9.256045097166592e-06, "loss": 0.1471, "step": 3020 }, { "entropy": 8.608394622802734, "epoch": 0.2985960055368796, "mean_token_accuracy": 0.73758864402771, "num_tokens": 16597147.0, "step": 3020, "train/ce_loss": 0.6753249764442444 }, { "epoch": 0.2985960055368796, "step": 3020, "train/sim_loss": 0.046875 }, { "epoch": 0.2985960055368796, "step": 3020, "train/total_loss": 0.1144075021147728 }, { "entropy": 9.079793930053711, "epoch": 0.2986948783863951, "mean_token_accuracy": 0.7650130391120911, "num_tokens": 16602560.0, "step": 3021, "train/ce_loss": 0.7294016480445862 }, { "epoch": 0.2986948783863951, "step": 3021, "train/sim_loss": 0.05078125 }, { "epoch": 0.2986948783863951, "step": 3021, "train/total_loss": 0.1237214133143425 }, { "entropy": 8.84538745880127, "epoch": 0.2987937512359106, "mean_token_accuracy": 0.7214285731315613, "num_tokens": 16608071.0, "step": 3022, "train/ce_loss": 0.4244869649410248 }, { "epoch": 0.2987937512359106, "step": 3022, "train/sim_loss": 0.08984375 }, { "epoch": 0.2987937512359106, "step": 3022, "train/total_loss": 0.13229244947433472 }, { "entropy": 9.184735298156738, "epoch": 0.29889262408542616, "mean_token_accuracy": 0.7420538067817688, "num_tokens": 16613498.0, "step": 3023, "train/ce_loss": 0.8869706988334656 }, { "epoch": 0.29889262408542616, "step": 3023, "train/sim_loss": 0.078125 }, { "epoch": 0.29889262408542616, "step": 3023, "train/total_loss": 0.16682207584381104 }, { "entropy": 9.501075744628906, "epoch": 0.29899149693494165, "mean_token_accuracy": 0.7447130084037781, "num_tokens": 16618727.0, "step": 3024, "train/ce_loss": 1.1041805744171143 }, { "epoch": 0.29899149693494165, "step": 3024, "train/sim_loss": 0.06640625 }, { "epoch": 0.29899149693494165, "step": 3024, "train/total_loss": 0.17682430148124695 }, { "entropy": 8.517784118652344, "epoch": 0.2990903697844572, "mean_token_accuracy": 0.6888715028762817, "num_tokens": 16624759.0, "step": 3025, "train/ce_loss": 0.7583527565002441 }, { "epoch": 0.2990903697844572, "step": 3025, "train/sim_loss": 0.12109375 }, { "epoch": 0.2990903697844572, "step": 3025, "train/total_loss": 0.19692903757095337 }, { "entropy": 9.077171325683594, "epoch": 0.29918924263397273, "mean_token_accuracy": 0.7142857313156128, "num_tokens": 16630210.0, "step": 3026, "train/ce_loss": 1.0449140071868896 }, { "epoch": 0.29918924263397273, "step": 3026, "train/sim_loss": 0.0546875 }, { "epoch": 0.29918924263397273, "step": 3026, "train/total_loss": 0.15917891263961792 }, { "entropy": 9.07779312133789, "epoch": 0.2992881154834882, "mean_token_accuracy": 0.7273838520050049, "num_tokens": 16635796.0, "step": 3027, "train/ce_loss": 0.6381266713142395 }, { "epoch": 0.2992881154834882, "step": 3027, "train/sim_loss": 0.06640625 }, { "epoch": 0.2992881154834882, "step": 3027, "train/total_loss": 0.13021892309188843 }, { "entropy": 8.678413391113281, "epoch": 0.29938698833300376, "mean_token_accuracy": 0.7433413863182068, "num_tokens": 16641523.0, "step": 3028, "train/ce_loss": 0.32183077931404114 }, { "epoch": 0.29938698833300376, "step": 3028, "train/sim_loss": 0.01953125 }, { "epoch": 0.29938698833300376, "step": 3028, "train/total_loss": 0.051714327186346054 }, { "entropy": 8.94394588470459, "epoch": 0.2994858611825193, "mean_token_accuracy": 0.7857935428619385, "num_tokens": 16647077.0, "step": 3029, "train/ce_loss": 0.5620577931404114 }, { "epoch": 0.2994858611825193, "step": 3029, "train/sim_loss": 0.0859375 }, { "epoch": 0.2994858611825193, "step": 3029, "train/total_loss": 0.14214327931404114 }, { "entropy": 9.105165481567383, "epoch": 0.2995847340320348, "mean_token_accuracy": 0.8126437067985535, "num_tokens": 16652528.0, "step": 3030, "train/ce_loss": 0.8655585050582886 }, { "epoch": 0.2995847340320348, "step": 3030, "train/sim_loss": 0.02734375 }, { "epoch": 0.2995847340320348, "step": 3030, "train/total_loss": 0.1138996034860611 }, { "entropy": 8.93304443359375, "epoch": 0.2996836068815503, "mean_token_accuracy": 0.741685152053833, "num_tokens": 16658110.0, "step": 3031, "train/ce_loss": 0.7050237059593201 }, { "epoch": 0.2996836068815503, "step": 3031, "train/sim_loss": 0.07421875 }, { "epoch": 0.2996836068815503, "step": 3031, "train/total_loss": 0.144721120595932 }, { "entropy": 8.802851676940918, "epoch": 0.29978247973106587, "mean_token_accuracy": 0.7788461446762085, "num_tokens": 16663927.0, "step": 3032, "train/ce_loss": 1.65800940990448 }, { "epoch": 0.29978247973106587, "step": 3032, "train/sim_loss": 0.0546875 }, { "epoch": 0.29978247973106587, "step": 3032, "train/total_loss": 0.22048844397068024 }, { "entropy": 8.88092041015625, "epoch": 0.29988135258058135, "mean_token_accuracy": 0.7843551635742188, "num_tokens": 16669460.0, "step": 3033, "train/ce_loss": 0.48213809728622437 }, { "epoch": 0.29988135258058135, "step": 3033, "train/sim_loss": 0.03125 }, { "epoch": 0.29988135258058135, "step": 3033, "train/total_loss": 0.07946380972862244 }, { "entropy": 9.217926025390625, "epoch": 0.2999802254300969, "mean_token_accuracy": 0.732903242111206, "num_tokens": 16674767.0, "step": 3034, "train/ce_loss": 0.9908206462860107 }, { "epoch": 0.2999802254300969, "step": 3034, "train/sim_loss": 0.05859375 }, { "epoch": 0.2999802254300969, "step": 3034, "train/total_loss": 0.1576758176088333 }, { "entropy": 8.564886093139648, "epoch": 0.30007909827961243, "mean_token_accuracy": 0.685606062412262, "num_tokens": 16680465.0, "step": 3035, "train/ce_loss": 1.934975266456604 }, { "epoch": 0.30007909827961243, "step": 3035, "train/sim_loss": 0.07421875 }, { "epoch": 0.30007909827961243, "step": 3035, "train/total_loss": 0.26771628856658936 }, { "entropy": 9.365888595581055, "epoch": 0.3001779711291279, "mean_token_accuracy": 0.7632312178611755, "num_tokens": 16685962.0, "step": 3036, "train/ce_loss": 1.369398593902588 }, { "epoch": 0.3001779711291279, "step": 3036, "train/sim_loss": 0.0859375 }, { "epoch": 0.3001779711291279, "step": 3036, "train/total_loss": 0.2228773683309555 }, { "entropy": 9.523872375488281, "epoch": 0.30027684397864346, "mean_token_accuracy": 0.7367668151855469, "num_tokens": 16691214.0, "step": 3037, "train/ce_loss": 1.401525616645813 }, { "epoch": 0.30027684397864346, "step": 3037, "train/sim_loss": 0.078125 }, { "epoch": 0.30027684397864346, "step": 3037, "train/total_loss": 0.21827755868434906 }, { "entropy": 9.159486770629883, "epoch": 0.300375716828159, "mean_token_accuracy": 0.7727891206741333, "num_tokens": 16696528.0, "step": 3038, "train/ce_loss": 0.5788246393203735 }, { "epoch": 0.300375716828159, "step": 3038, "train/sim_loss": 0.0703125 }, { "epoch": 0.300375716828159, "step": 3038, "train/total_loss": 0.12819495797157288 }, { "entropy": 9.092423439025879, "epoch": 0.3004745896776745, "mean_token_accuracy": 0.8162761926651001, "num_tokens": 16701921.0, "step": 3039, "train/ce_loss": 0.6399224400520325 }, { "epoch": 0.3004745896776745, "step": 3039, "train/sim_loss": 0.01953125 }, { "epoch": 0.3004745896776745, "step": 3039, "train/total_loss": 0.08352349698543549 }, { "epoch": 0.30057346252719, "grad_norm": 0.6732919216156006, "learning_rate": 9.251100232408645e-06, "loss": 0.1422, "step": 3040 }, { "entropy": 9.248085021972656, "epoch": 0.30057346252719, "mean_token_accuracy": 0.7307189702987671, "num_tokens": 16707345.0, "step": 3040, "train/ce_loss": 1.3986095190048218 }, { "epoch": 0.30057346252719, "step": 3040, "train/sim_loss": 0.078125 }, { "epoch": 0.30057346252719, "step": 3040, "train/total_loss": 0.21798595786094666 }, { "entropy": 8.909223556518555, "epoch": 0.30067233537670557, "mean_token_accuracy": 0.7371188402175903, "num_tokens": 16712955.0, "step": 3041, "train/ce_loss": 0.9023287296295166 }, { "epoch": 0.30067233537670557, "step": 3041, "train/sim_loss": 0.0546875 }, { "epoch": 0.30067233537670557, "step": 3041, "train/total_loss": 0.14492037892341614 }, { "entropy": 8.387813568115234, "epoch": 0.30077120822622105, "mean_token_accuracy": 0.6937553286552429, "num_tokens": 16718694.0, "step": 3042, "train/ce_loss": 1.8257883787155151 }, { "epoch": 0.30077120822622105, "step": 3042, "train/sim_loss": 0.0625 }, { "epoch": 0.30077120822622105, "step": 3042, "train/total_loss": 0.24507884681224823 }, { "entropy": 8.929826736450195, "epoch": 0.3008700810757366, "mean_token_accuracy": 0.754923403263092, "num_tokens": 16724083.0, "step": 3043, "train/ce_loss": 0.6239924430847168 }, { "epoch": 0.3008700810757366, "step": 3043, "train/sim_loss": 0.0859375 }, { "epoch": 0.3008700810757366, "step": 3043, "train/total_loss": 0.1483367383480072 }, { "entropy": 9.001482009887695, "epoch": 0.30096895392525214, "mean_token_accuracy": 0.7494145035743713, "num_tokens": 16729536.0, "step": 3044, "train/ce_loss": 0.5633025765419006 }, { "epoch": 0.30096895392525214, "step": 3044, "train/sim_loss": 0.05078125 }, { "epoch": 0.30096895392525214, "step": 3044, "train/total_loss": 0.10711151361465454 }, { "entropy": 8.968812942504883, "epoch": 0.3010678267747676, "mean_token_accuracy": 0.741813600063324, "num_tokens": 16734917.0, "step": 3045, "train/ce_loss": 0.9240159392356873 }, { "epoch": 0.3010678267747676, "step": 3045, "train/sim_loss": 0.0390625 }, { "epoch": 0.3010678267747676, "step": 3045, "train/total_loss": 0.13146409392356873 }, { "entropy": 8.790283203125, "epoch": 0.30116669962428316, "mean_token_accuracy": 0.7191630005836487, "num_tokens": 16740352.0, "step": 3046, "train/ce_loss": 0.8551459908485413 }, { "epoch": 0.30116669962428316, "step": 3046, "train/sim_loss": 0.1015625 }, { "epoch": 0.30116669962428316, "step": 3046, "train/total_loss": 0.1870771050453186 }, { "entropy": 8.679811477661133, "epoch": 0.3012655724737987, "mean_token_accuracy": 0.6719716787338257, "num_tokens": 16746007.0, "step": 3047, "train/ce_loss": 0.9280754327774048 }, { "epoch": 0.3012655724737987, "step": 3047, "train/sim_loss": 0.0859375 }, { "epoch": 0.3012655724737987, "step": 3047, "train/total_loss": 0.17874504625797272 }, { "entropy": 8.704329490661621, "epoch": 0.30136444532331425, "mean_token_accuracy": 0.7955307364463806, "num_tokens": 16751598.0, "step": 3048, "train/ce_loss": 0.2840290665626526 }, { "epoch": 0.30136444532331425, "step": 3048, "train/sim_loss": 0.0234375 }, { "epoch": 0.30136444532331425, "step": 3048, "train/total_loss": 0.0518404096364975 }, { "entropy": 9.252098083496094, "epoch": 0.30146331817282973, "mean_token_accuracy": 0.7450381517410278, "num_tokens": 16756864.0, "step": 3049, "train/ce_loss": 0.6959392428398132 }, { "epoch": 0.30146331817282973, "step": 3049, "train/sim_loss": 0.0625 }, { "epoch": 0.30146331817282973, "step": 3049, "train/total_loss": 0.13209393620491028 }, { "entropy": 9.101897239685059, "epoch": 0.30156219102234527, "mean_token_accuracy": 0.7430555820465088, "num_tokens": 16762285.0, "step": 3050, "train/ce_loss": 0.8570864796638489 }, { "epoch": 0.30156219102234527, "step": 3050, "train/sim_loss": 0.0859375 }, { "epoch": 0.30156219102234527, "step": 3050, "train/total_loss": 0.1716461479663849 }, { "entropy": 8.84647274017334, "epoch": 0.3016610638718608, "mean_token_accuracy": 0.7653806209564209, "num_tokens": 16767811.0, "step": 3051, "train/ce_loss": 1.526676058769226 }, { "epoch": 0.3016610638718608, "step": 3051, "train/sim_loss": 0.109375 }, { "epoch": 0.3016610638718608, "step": 3051, "train/total_loss": 0.2620426118373871 }, { "entropy": 9.059497833251953, "epoch": 0.3017599367213763, "mean_token_accuracy": 0.757080614566803, "num_tokens": 16773297.0, "step": 3052, "train/ce_loss": 0.9478781223297119 }, { "epoch": 0.3017599367213763, "step": 3052, "train/sim_loss": 0.03125 }, { "epoch": 0.3017599367213763, "step": 3052, "train/total_loss": 0.1260378062725067 }, { "entropy": 9.119873046875, "epoch": 0.30185880957089184, "mean_token_accuracy": 0.7950170040130615, "num_tokens": 16778780.0, "step": 3053, "train/ce_loss": 0.6432309746742249 }, { "epoch": 0.30185880957089184, "step": 3053, "train/sim_loss": 0.046875 }, { "epoch": 0.30185880957089184, "step": 3053, "train/total_loss": 0.11119809746742249 }, { "entropy": 9.184306144714355, "epoch": 0.3019576824204074, "mean_token_accuracy": 0.7727825045585632, "num_tokens": 16784106.0, "step": 3054, "train/ce_loss": 0.6364697217941284 }, { "epoch": 0.3019576824204074, "step": 3054, "train/sim_loss": 0.0703125 }, { "epoch": 0.3019576824204074, "step": 3054, "train/total_loss": 0.13395947217941284 }, { "entropy": 8.672124862670898, "epoch": 0.30205655526992287, "mean_token_accuracy": 0.7318142056465149, "num_tokens": 16789808.0, "step": 3055, "train/ce_loss": 0.40968021750450134 }, { "epoch": 0.30205655526992287, "step": 3055, "train/sim_loss": 0.0234375 }, { "epoch": 0.30205655526992287, "step": 3055, "train/total_loss": 0.06440552324056625 }, { "entropy": 8.881759643554688, "epoch": 0.3021554281194384, "mean_token_accuracy": 0.7017353773117065, "num_tokens": 16795348.0, "step": 3056, "train/ce_loss": 1.3866326808929443 }, { "epoch": 0.3021554281194384, "step": 3056, "train/sim_loss": 0.1328125 }, { "epoch": 0.3021554281194384, "step": 3056, "train/total_loss": 0.27147579193115234 }, { "entropy": 8.540261268615723, "epoch": 0.30225430096895395, "mean_token_accuracy": 0.746999979019165, "num_tokens": 16801023.0, "step": 3057, "train/ce_loss": 0.5674982070922852 }, { "epoch": 0.30225430096895395, "step": 3057, "train/sim_loss": 0.078125 }, { "epoch": 0.30225430096895395, "step": 3057, "train/total_loss": 0.13487482070922852 }, { "entropy": 8.960662841796875, "epoch": 0.30235317381846943, "mean_token_accuracy": 0.7842907309532166, "num_tokens": 16806472.0, "step": 3058, "train/ce_loss": 0.7196112275123596 }, { "epoch": 0.30235317381846943, "step": 3058, "train/sim_loss": 0.05859375 }, { "epoch": 0.30235317381846943, "step": 3058, "train/total_loss": 0.13055488467216492 }, { "entropy": 9.083641052246094, "epoch": 0.302452046667985, "mean_token_accuracy": 0.7856225967407227, "num_tokens": 16812100.0, "step": 3059, "train/ce_loss": 0.47349271178245544 }, { "epoch": 0.302452046667985, "step": 3059, "train/sim_loss": 0.1171875 }, { "epoch": 0.302452046667985, "step": 3059, "train/total_loss": 0.16453677415847778 }, { "epoch": 0.3025509195175005, "grad_norm": 0.8660598397254944, "learning_rate": 9.246155367650695e-06, "loss": 0.1398, "step": 3060 }, { "entropy": 8.707700729370117, "epoch": 0.3025509195175005, "mean_token_accuracy": 0.7022094130516052, "num_tokens": 16817762.0, "step": 3060, "train/ce_loss": 0.9139187932014465 }, { "epoch": 0.3025509195175005, "step": 3060, "train/sim_loss": 0.0703125 }, { "epoch": 0.3025509195175005, "step": 3060, "train/total_loss": 0.1617043912410736 }, { "entropy": 9.1298189163208, "epoch": 0.302649792367016, "mean_token_accuracy": 0.734375, "num_tokens": 16823169.0, "step": 3061, "train/ce_loss": 0.8268902897834778 }, { "epoch": 0.302649792367016, "step": 3061, "train/sim_loss": 0.04296875 }, { "epoch": 0.302649792367016, "step": 3061, "train/total_loss": 0.12565778195858002 }, { "entropy": 8.835563659667969, "epoch": 0.30274866521653154, "mean_token_accuracy": 0.7260273694992065, "num_tokens": 16828799.0, "step": 3062, "train/ce_loss": 0.9456378817558289 }, { "epoch": 0.30274866521653154, "step": 3062, "train/sim_loss": 0.046875 }, { "epoch": 0.30274866521653154, "step": 3062, "train/total_loss": 0.1414387822151184 }, { "entropy": 8.938909530639648, "epoch": 0.3028475380660471, "mean_token_accuracy": 0.772042989730835, "num_tokens": 16834316.0, "step": 3063, "train/ce_loss": 0.7499306797981262 }, { "epoch": 0.3028475380660471, "step": 3063, "train/sim_loss": 0.02734375 }, { "epoch": 0.3028475380660471, "step": 3063, "train/total_loss": 0.1023368164896965 }, { "entropy": 9.190184593200684, "epoch": 0.30294641091556257, "mean_token_accuracy": 0.7473841309547424, "num_tokens": 16839632.0, "step": 3064, "train/ce_loss": 0.6211099028587341 }, { "epoch": 0.30294641091556257, "step": 3064, "train/sim_loss": 0.09765625 }, { "epoch": 0.30294641091556257, "step": 3064, "train/total_loss": 0.1597672402858734 }, { "entropy": 9.474180221557617, "epoch": 0.3030452837650781, "mean_token_accuracy": 0.7402985095977783, "num_tokens": 16844844.0, "step": 3065, "train/ce_loss": 0.5495256781578064 }, { "epoch": 0.3030452837650781, "step": 3065, "train/sim_loss": 0.1015625 }, { "epoch": 0.3030452837650781, "step": 3065, "train/total_loss": 0.15651506185531616 }, { "entropy": 9.074810028076172, "epoch": 0.30314415661459365, "mean_token_accuracy": 0.7259684205055237, "num_tokens": 16850216.0, "step": 3066, "train/ce_loss": 1.139854907989502 }, { "epoch": 0.30314415661459365, "step": 3066, "train/sim_loss": 0.05859375 }, { "epoch": 0.30314415661459365, "step": 3066, "train/total_loss": 0.17257924377918243 }, { "entropy": 8.913167953491211, "epoch": 0.30324302946410914, "mean_token_accuracy": 0.7485311627388, "num_tokens": 16855658.0, "step": 3067, "train/ce_loss": 0.6959477663040161 }, { "epoch": 0.30324302946410914, "step": 3067, "train/sim_loss": 0.10546875 }, { "epoch": 0.30324302946410914, "step": 3067, "train/total_loss": 0.17506352066993713 }, { "entropy": 8.874805450439453, "epoch": 0.3033419023136247, "mean_token_accuracy": 0.7800687551498413, "num_tokens": 16861104.0, "step": 3068, "train/ce_loss": 0.5206518173217773 }, { "epoch": 0.3033419023136247, "step": 3068, "train/sim_loss": 0.01953125 }, { "epoch": 0.3033419023136247, "step": 3068, "train/total_loss": 0.0715964287519455 }, { "entropy": 9.375828742980957, "epoch": 0.3034407751631402, "mean_token_accuracy": 0.796187698841095, "num_tokens": 16866363.0, "step": 3069, "train/ce_loss": 0.5849049687385559 }, { "epoch": 0.3034407751631402, "step": 3069, "train/sim_loss": 0.078125 }, { "epoch": 0.3034407751631402, "step": 3069, "train/total_loss": 0.13661549985408783 }, { "entropy": 8.349678993225098, "epoch": 0.3035396480126557, "mean_token_accuracy": 0.779534101486206, "num_tokens": 16872209.0, "step": 3070, "train/ce_loss": 0.2649896740913391 }, { "epoch": 0.3035396480126557, "step": 3070, "train/sim_loss": 0.0703125 }, { "epoch": 0.3035396480126557, "step": 3070, "train/total_loss": 0.09681146591901779 }, { "entropy": 8.665989875793457, "epoch": 0.30363852086217125, "mean_token_accuracy": 0.7717717885971069, "num_tokens": 16877891.0, "step": 3071, "train/ce_loss": 0.650156557559967 }, { "epoch": 0.30363852086217125, "step": 3071, "train/sim_loss": 0.078125 }, { "epoch": 0.30363852086217125, "step": 3071, "train/total_loss": 0.14314065873622894 }, { "entropy": 9.134130477905273, "epoch": 0.3037373937116868, "mean_token_accuracy": 0.7407407164573669, "num_tokens": 16883310.0, "step": 3072, "train/ce_loss": 1.0597552061080933 }, { "epoch": 0.3037373937116868, "step": 3072, "train/sim_loss": 0.01953125 }, { "epoch": 0.3037373937116868, "step": 3072, "train/total_loss": 0.12550677359104156 }, { "entropy": 8.486385345458984, "epoch": 0.3038362665612023, "mean_token_accuracy": 0.7537453174591064, "num_tokens": 16889069.0, "step": 3073, "train/ce_loss": 0.7385778427124023 }, { "epoch": 0.3038362665612023, "step": 3073, "train/sim_loss": 0.08203125 }, { "epoch": 0.3038362665612023, "step": 3073, "train/total_loss": 0.15588903427124023 }, { "entropy": 8.68948745727539, "epoch": 0.3039351394107178, "mean_token_accuracy": 0.7425083518028259, "num_tokens": 16894628.0, "step": 3074, "train/ce_loss": 1.246899962425232 }, { "epoch": 0.3039351394107178, "step": 3074, "train/sim_loss": 0.09765625 }, { "epoch": 0.3039351394107178, "step": 3074, "train/total_loss": 0.2223462462425232 }, { "entropy": 9.103607177734375, "epoch": 0.30403401226023336, "mean_token_accuracy": 0.7974193692207336, "num_tokens": 16899988.0, "step": 3075, "train/ce_loss": 0.42619428038597107 }, { "epoch": 0.30403401226023336, "step": 3075, "train/sim_loss": 0.05859375 }, { "epoch": 0.30403401226023336, "step": 3075, "train/total_loss": 0.10121317952871323 }, { "entropy": 8.951127052307129, "epoch": 0.30413288510974884, "mean_token_accuracy": 0.6883720755577087, "num_tokens": 16905421.0, "step": 3076, "train/ce_loss": 0.5841593742370605 }, { "epoch": 0.30413288510974884, "step": 3076, "train/sim_loss": 0.05078125 }, { "epoch": 0.30413288510974884, "step": 3076, "train/total_loss": 0.10919718444347382 }, { "entropy": 8.978059768676758, "epoch": 0.3042317579592644, "mean_token_accuracy": 0.7644539475440979, "num_tokens": 16910880.0, "step": 3077, "train/ce_loss": 1.2236387729644775 }, { "epoch": 0.3042317579592644, "step": 3077, "train/sim_loss": 0.09765625 }, { "epoch": 0.3042317579592644, "step": 3077, "train/total_loss": 0.22002013027668 }, { "entropy": 8.757209777832031, "epoch": 0.3043306308087799, "mean_token_accuracy": 0.678670346736908, "num_tokens": 16916517.0, "step": 3078, "train/ce_loss": 0.5801843404769897 }, { "epoch": 0.3043306308087799, "step": 3078, "train/sim_loss": 0.1171875 }, { "epoch": 0.3043306308087799, "step": 3078, "train/total_loss": 0.17520593106746674 }, { "entropy": 9.020451545715332, "epoch": 0.3044295036582954, "mean_token_accuracy": 0.7188703417778015, "num_tokens": 16921958.0, "step": 3079, "train/ce_loss": 0.8600640892982483 }, { "epoch": 0.3044295036582954, "step": 3079, "train/sim_loss": 0.02734375 }, { "epoch": 0.3044295036582954, "step": 3079, "train/total_loss": 0.11335016041994095 }, { "epoch": 0.30452837650781095, "grad_norm": 0.9304722547531128, "learning_rate": 9.241210502892747e-06, "loss": 0.1429, "step": 3080 }, { "entropy": 9.053438186645508, "epoch": 0.30452837650781095, "mean_token_accuracy": 0.7123456597328186, "num_tokens": 16927361.0, "step": 3080, "train/ce_loss": 0.8182748556137085 }, { "epoch": 0.30452837650781095, "step": 3080, "train/sim_loss": 0.05078125 }, { "epoch": 0.30452837650781095, "step": 3080, "train/total_loss": 0.13260874152183533 }, { "entropy": 8.644874572753906, "epoch": 0.3046272493573265, "mean_token_accuracy": 0.7028518915176392, "num_tokens": 16933109.0, "step": 3081, "train/ce_loss": 0.6460102200508118 }, { "epoch": 0.3046272493573265, "step": 3081, "train/sim_loss": 0.0546875 }, { "epoch": 0.3046272493573265, "step": 3081, "train/total_loss": 0.11928852647542953 }, { "entropy": 9.03573989868164, "epoch": 0.304726122206842, "mean_token_accuracy": 0.7210884094238281, "num_tokens": 16938558.0, "step": 3082, "train/ce_loss": 0.6661242842674255 }, { "epoch": 0.304726122206842, "step": 3082, "train/sim_loss": 0.0390625 }, { "epoch": 0.304726122206842, "step": 3082, "train/total_loss": 0.10567492991685867 }, { "entropy": 8.981025695800781, "epoch": 0.3048249950563575, "mean_token_accuracy": 0.7749999761581421, "num_tokens": 16944056.0, "step": 3083, "train/ce_loss": 0.41553300619125366 }, { "epoch": 0.3048249950563575, "step": 3083, "train/sim_loss": 0.05859375 }, { "epoch": 0.3048249950563575, "step": 3083, "train/total_loss": 0.1001470535993576 }, { "entropy": 9.194904327392578, "epoch": 0.30492386790587306, "mean_token_accuracy": 0.730867326259613, "num_tokens": 16949468.0, "step": 3084, "train/ce_loss": 1.0826817750930786 }, { "epoch": 0.30492386790587306, "step": 3084, "train/sim_loss": 0.05078125 }, { "epoch": 0.30492386790587306, "step": 3084, "train/total_loss": 0.15904942154884338 }, { "entropy": 9.145282745361328, "epoch": 0.30502274075538854, "mean_token_accuracy": 0.7862318754196167, "num_tokens": 16954860.0, "step": 3085, "train/ce_loss": 0.7905389070510864 }, { "epoch": 0.30502274075538854, "step": 3085, "train/sim_loss": 0.03515625 }, { "epoch": 0.30502274075538854, "step": 3085, "train/total_loss": 0.11421014368534088 }, { "entropy": 8.905024528503418, "epoch": 0.3051216136049041, "mean_token_accuracy": 0.701179563999176, "num_tokens": 16960247.0, "step": 3086, "train/ce_loss": 0.9506229758262634 }, { "epoch": 0.3051216136049041, "step": 3086, "train/sim_loss": 0.05078125 }, { "epoch": 0.3051216136049041, "step": 3086, "train/total_loss": 0.14584355056285858 }, { "entropy": 9.265642166137695, "epoch": 0.3052204864544196, "mean_token_accuracy": 0.7131882309913635, "num_tokens": 16965577.0, "step": 3087, "train/ce_loss": 0.5757442116737366 }, { "epoch": 0.3052204864544196, "step": 3087, "train/sim_loss": 0.0625 }, { "epoch": 0.3052204864544196, "step": 3087, "train/total_loss": 0.12007442116737366 }, { "entropy": 9.108185768127441, "epoch": 0.3053193593039351, "mean_token_accuracy": 0.7755581736564636, "num_tokens": 16970990.0, "step": 3088, "train/ce_loss": 0.46007466316223145 }, { "epoch": 0.3053193593039351, "step": 3088, "train/sim_loss": 0.015625 }, { "epoch": 0.3053193593039351, "step": 3088, "train/total_loss": 0.061632465571165085 }, { "entropy": 8.726178169250488, "epoch": 0.30541823215345065, "mean_token_accuracy": 0.7241379022598267, "num_tokens": 16976527.0, "step": 3089, "train/ce_loss": 1.2618165016174316 }, { "epoch": 0.30541823215345065, "step": 3089, "train/sim_loss": 0.1015625 }, { "epoch": 0.30541823215345065, "step": 3089, "train/total_loss": 0.22774414718151093 }, { "entropy": 8.977117538452148, "epoch": 0.3055171050029662, "mean_token_accuracy": 0.7525083422660828, "num_tokens": 16982105.0, "step": 3090, "train/ce_loss": 0.9796067476272583 }, { "epoch": 0.3055171050029662, "step": 3090, "train/sim_loss": 0.0703125 }, { "epoch": 0.3055171050029662, "step": 3090, "train/total_loss": 0.1682731807231903 }, { "entropy": 8.717645645141602, "epoch": 0.30561597785248173, "mean_token_accuracy": 0.7397540807723999, "num_tokens": 16987725.0, "step": 3091, "train/ce_loss": 0.5028952360153198 }, { "epoch": 0.30561597785248173, "step": 3091, "train/sim_loss": 0.0390625 }, { "epoch": 0.30561597785248173, "step": 3091, "train/total_loss": 0.08935202658176422 }, { "entropy": 9.01725959777832, "epoch": 0.3057148507019972, "mean_token_accuracy": 0.6815145015716553, "num_tokens": 16993288.0, "step": 3092, "train/ce_loss": 0.5624918341636658 }, { "epoch": 0.3057148507019972, "step": 3092, "train/sim_loss": 0.09375 }, { "epoch": 0.3057148507019972, "step": 3092, "train/total_loss": 0.14999918639659882 }, { "entropy": 8.769834518432617, "epoch": 0.30581372355151276, "mean_token_accuracy": 0.7649903297424316, "num_tokens": 16998917.0, "step": 3093, "train/ce_loss": 0.4579542875289917 }, { "epoch": 0.30581372355151276, "step": 3093, "train/sim_loss": 0.02734375 }, { "epoch": 0.30581372355151276, "step": 3093, "train/total_loss": 0.07313917577266693 }, { "entropy": 9.386171340942383, "epoch": 0.3059125964010283, "mean_token_accuracy": 0.7055555582046509, "num_tokens": 17004133.0, "step": 3094, "train/ce_loss": 1.2718251943588257 }, { "epoch": 0.3059125964010283, "step": 3094, "train/sim_loss": 0.109375 }, { "epoch": 0.3059125964010283, "step": 3094, "train/total_loss": 0.23655752837657928 }, { "entropy": 8.744091987609863, "epoch": 0.3060114692505438, "mean_token_accuracy": 0.7528344392776489, "num_tokens": 17009777.0, "step": 3095, "train/ce_loss": 0.460539311170578 }, { "epoch": 0.3060114692505438, "step": 3095, "train/sim_loss": 0.0546875 }, { "epoch": 0.3060114692505438, "step": 3095, "train/total_loss": 0.1007414311170578 }, { "entropy": 9.120452880859375, "epoch": 0.30611034210005933, "mean_token_accuracy": 0.7551928758621216, "num_tokens": 17015046.0, "step": 3096, "train/ce_loss": 0.5844276547431946 }, { "epoch": 0.30611034210005933, "step": 3096, "train/sim_loss": 0.02734375 }, { "epoch": 0.30611034210005933, "step": 3096, "train/total_loss": 0.08578652143478394 }, { "entropy": 8.870574951171875, "epoch": 0.30620921494957487, "mean_token_accuracy": 0.7411067485809326, "num_tokens": 17020644.0, "step": 3097, "train/ce_loss": 0.3833440840244293 }, { "epoch": 0.30620921494957487, "step": 3097, "train/sim_loss": 0.01953125 }, { "epoch": 0.30620921494957487, "step": 3097, "train/total_loss": 0.05786566063761711 }, { "entropy": 8.79910659790039, "epoch": 0.30630808779909036, "mean_token_accuracy": 0.699447512626648, "num_tokens": 17026163.0, "step": 3098, "train/ce_loss": 1.1933748722076416 }, { "epoch": 0.30630808779909036, "step": 3098, "train/sim_loss": 0.0390625 }, { "epoch": 0.30630808779909036, "step": 3098, "train/total_loss": 0.15839999914169312 }, { "entropy": 9.22780704498291, "epoch": 0.3064069606486059, "mean_token_accuracy": 0.6917562484741211, "num_tokens": 17031574.0, "step": 3099, "train/ce_loss": 0.9937363266944885 }, { "epoch": 0.3064069606486059, "step": 3099, "train/sim_loss": 0.06640625 }, { "epoch": 0.3064069606486059, "step": 3099, "train/total_loss": 0.16577988862991333 }, { "epoch": 0.30650583349812144, "grad_norm": 0.8800408840179443, "learning_rate": 9.236265638134798e-06, "loss": 0.1501, "step": 3100 }, { "entropy": 8.951703071594238, "epoch": 0.30650583349812144, "mean_token_accuracy": 0.7458233833312988, "num_tokens": 17037044.0, "step": 3100, "train/ce_loss": 0.4844367504119873 }, { "epoch": 0.30650583349812144, "step": 3100, "train/sim_loss": 0.0703125 }, { "epoch": 0.30650583349812144, "step": 3100, "train/total_loss": 0.11875617504119873 }, { "entropy": 9.086009979248047, "epoch": 0.3066047063476369, "mean_token_accuracy": 0.7695906162261963, "num_tokens": 17042489.0, "step": 3101, "train/ce_loss": 0.9228971600532532 }, { "epoch": 0.3066047063476369, "step": 3101, "train/sim_loss": 0.0234375 }, { "epoch": 0.3066047063476369, "step": 3101, "train/total_loss": 0.11572721600532532 }, { "entropy": 8.88589859008789, "epoch": 0.30670357919715246, "mean_token_accuracy": 0.7754868268966675, "num_tokens": 17047956.0, "step": 3102, "train/ce_loss": 0.5477607846260071 }, { "epoch": 0.30670357919715246, "step": 3102, "train/sim_loss": 0.03515625 }, { "epoch": 0.30670357919715246, "step": 3102, "train/total_loss": 0.08993232995271683 }, { "entropy": 8.986822128295898, "epoch": 0.306802452046668, "mean_token_accuracy": 0.8047722578048706, "num_tokens": 17053429.0, "step": 3103, "train/ce_loss": 0.420829713344574 }, { "epoch": 0.306802452046668, "step": 3103, "train/sim_loss": 0.09765625 }, { "epoch": 0.306802452046668, "step": 3103, "train/total_loss": 0.13973921537399292 }, { "entropy": 8.545660018920898, "epoch": 0.3069013248961835, "mean_token_accuracy": 0.7154340744018555, "num_tokens": 17059225.0, "step": 3104, "train/ce_loss": 0.39070507884025574 }, { "epoch": 0.3069013248961835, "step": 3104, "train/sim_loss": 0.03125 }, { "epoch": 0.3069013248961835, "step": 3104, "train/total_loss": 0.0703205093741417 }, { "entropy": 8.91838264465332, "epoch": 0.30700019774569903, "mean_token_accuracy": 0.7303754091262817, "num_tokens": 17064687.0, "step": 3105, "train/ce_loss": 0.488307923078537 }, { "epoch": 0.30700019774569903, "step": 3105, "train/sim_loss": 0.046875 }, { "epoch": 0.30700019774569903, "step": 3105, "train/total_loss": 0.0957057923078537 }, { "entropy": 8.782447814941406, "epoch": 0.3070990705952146, "mean_token_accuracy": 0.7258567214012146, "num_tokens": 17070279.0, "step": 3106, "train/ce_loss": 0.9980507493019104 }, { "epoch": 0.3070990705952146, "step": 3106, "train/sim_loss": 0.0625 }, { "epoch": 0.3070990705952146, "step": 3106, "train/total_loss": 0.16230508685112 }, { "entropy": 9.130406379699707, "epoch": 0.30719794344473006, "mean_token_accuracy": 0.7160963416099548, "num_tokens": 17075647.0, "step": 3107, "train/ce_loss": 0.45451632142066956 }, { "epoch": 0.30719794344473006, "step": 3107, "train/sim_loss": 0.07421875 }, { "epoch": 0.30719794344473006, "step": 3107, "train/total_loss": 0.11967038363218307 }, { "entropy": 8.825738906860352, "epoch": 0.3072968162942456, "mean_token_accuracy": 0.7285861968994141, "num_tokens": 17081148.0, "step": 3108, "train/ce_loss": 0.9467611908912659 }, { "epoch": 0.3072968162942456, "step": 3108, "train/sim_loss": 0.03125 }, { "epoch": 0.3072968162942456, "step": 3108, "train/total_loss": 0.12592612206935883 }, { "entropy": 8.907684326171875, "epoch": 0.30739568914376114, "mean_token_accuracy": 0.7032474875450134, "num_tokens": 17086673.0, "step": 3109, "train/ce_loss": 1.4115108251571655 }, { "epoch": 0.30739568914376114, "step": 3109, "train/sim_loss": 0.1171875 }, { "epoch": 0.30739568914376114, "step": 3109, "train/total_loss": 0.2583385705947876 }, { "entropy": 8.65169620513916, "epoch": 0.3074945619932766, "mean_token_accuracy": 0.6810035705566406, "num_tokens": 17092390.0, "step": 3110, "train/ce_loss": 1.2016295194625854 }, { "epoch": 0.3074945619932766, "step": 3110, "train/sim_loss": 0.0703125 }, { "epoch": 0.3074945619932766, "step": 3110, "train/total_loss": 0.1904754638671875 }, { "entropy": 8.826128005981445, "epoch": 0.30759343484279217, "mean_token_accuracy": 0.7510548233985901, "num_tokens": 17098034.0, "step": 3111, "train/ce_loss": 1.0890947580337524 }, { "epoch": 0.30759343484279217, "step": 3111, "train/sim_loss": 0.08984375 }, { "epoch": 0.30759343484279217, "step": 3111, "train/total_loss": 0.1987532377243042 }, { "entropy": 9.299858093261719, "epoch": 0.3076923076923077, "mean_token_accuracy": 0.7362318634986877, "num_tokens": 17103296.0, "step": 3112, "train/ce_loss": 0.9142554998397827 }, { "epoch": 0.3076923076923077, "step": 3112, "train/sim_loss": 0.125 }, { "epoch": 0.3076923076923077, "step": 3112, "train/total_loss": 0.2164255529642105 }, { "entropy": 8.98437786102295, "epoch": 0.3077911805418232, "mean_token_accuracy": 0.7710843086242676, "num_tokens": 17108785.0, "step": 3113, "train/ce_loss": 1.3922609090805054 }, { "epoch": 0.3077911805418232, "step": 3113, "train/sim_loss": 0.1640625 }, { "epoch": 0.3077911805418232, "step": 3113, "train/total_loss": 0.3032885789871216 }, { "entropy": 9.23414134979248, "epoch": 0.30789005339133874, "mean_token_accuracy": 0.7583444714546204, "num_tokens": 17114170.0, "step": 3114, "train/ce_loss": 0.8767290115356445 }, { "epoch": 0.30789005339133874, "step": 3114, "train/sim_loss": 0.0546875 }, { "epoch": 0.30789005339133874, "step": 3114, "train/total_loss": 0.1423604041337967 }, { "entropy": 9.193721771240234, "epoch": 0.3079889262408543, "mean_token_accuracy": 0.7808988690376282, "num_tokens": 17119467.0, "step": 3115, "train/ce_loss": 0.6617518067359924 }, { "epoch": 0.3079889262408543, "step": 3115, "train/sim_loss": 0.08203125 }, { "epoch": 0.3079889262408543, "step": 3115, "train/total_loss": 0.1482064425945282 }, { "entropy": 8.980582237243652, "epoch": 0.30808779909036976, "mean_token_accuracy": 0.6940509676933289, "num_tokens": 17124803.0, "step": 3116, "train/ce_loss": 1.248450756072998 }, { "epoch": 0.30808779909036976, "step": 3116, "train/sim_loss": 0.11328125 }, { "epoch": 0.30808779909036976, "step": 3116, "train/total_loss": 0.23812633752822876 }, { "entropy": 8.955608367919922, "epoch": 0.3081866719398853, "mean_token_accuracy": 0.7293729186058044, "num_tokens": 17130320.0, "step": 3117, "train/ce_loss": 1.403515338897705 }, { "epoch": 0.3081866719398853, "step": 3117, "train/sim_loss": 0.125 }, { "epoch": 0.3081866719398853, "step": 3117, "train/total_loss": 0.2653515338897705 }, { "entropy": 9.243014335632324, "epoch": 0.30828554478940084, "mean_token_accuracy": 0.7738562226295471, "num_tokens": 17135565.0, "step": 3118, "train/ce_loss": 0.700927734375 }, { "epoch": 0.30828554478940084, "step": 3118, "train/sim_loss": 0.046875 }, { "epoch": 0.30828554478940084, "step": 3118, "train/total_loss": 0.11696777492761612 }, { "entropy": 8.892450332641602, "epoch": 0.30838441763891633, "mean_token_accuracy": 0.7309812307357788, "num_tokens": 17141126.0, "step": 3119, "train/ce_loss": 0.8353440165519714 }, { "epoch": 0.30838441763891633, "step": 3119, "train/sim_loss": 0.1640625 }, { "epoch": 0.30838441763891633, "step": 3119, "train/total_loss": 0.24759690463542938 }, { "epoch": 0.30848329048843187, "grad_norm": 0.963024914264679, "learning_rate": 9.231320773376848e-06, "loss": 0.1505, "step": 3120 }, { "entropy": 8.705280303955078, "epoch": 0.30848329048843187, "mean_token_accuracy": 0.7357211709022522, "num_tokens": 17146847.0, "step": 3120, "train/ce_loss": 0.35779300332069397 }, { "epoch": 0.30848329048843187, "step": 3120, "train/sim_loss": 0.015625 }, { "epoch": 0.30848329048843187, "step": 3120, "train/total_loss": 0.05140430107712746 }, { "entropy": 8.872904777526855, "epoch": 0.3085821633379474, "mean_token_accuracy": 0.7420718669891357, "num_tokens": 17152360.0, "step": 3121, "train/ce_loss": 0.751602292060852 }, { "epoch": 0.3085821633379474, "step": 3121, "train/sim_loss": 0.05078125 }, { "epoch": 0.3085821633379474, "step": 3121, "train/total_loss": 0.12594148516654968 }, { "entropy": 8.857254028320312, "epoch": 0.3086810361874629, "mean_token_accuracy": 0.7903929948806763, "num_tokens": 17157871.0, "step": 3122, "train/ce_loss": 0.5299984216690063 }, { "epoch": 0.3086810361874629, "step": 3122, "train/sim_loss": 0.04296875 }, { "epoch": 0.3086810361874629, "step": 3122, "train/total_loss": 0.0959685891866684 }, { "entropy": 8.928861618041992, "epoch": 0.30877990903697844, "mean_token_accuracy": 0.6556097269058228, "num_tokens": 17163495.0, "step": 3123, "train/ce_loss": 1.8806591033935547 }, { "epoch": 0.30877990903697844, "step": 3123, "train/sim_loss": 0.15234375 }, { "epoch": 0.30877990903697844, "step": 3123, "train/total_loss": 0.34040966629981995 }, { "entropy": 8.977429389953613, "epoch": 0.308878781886494, "mean_token_accuracy": 0.7293689250946045, "num_tokens": 17168957.0, "step": 3124, "train/ce_loss": 0.9548178315162659 }, { "epoch": 0.308878781886494, "step": 3124, "train/sim_loss": 0.09375 }, { "epoch": 0.308878781886494, "step": 3124, "train/total_loss": 0.1892317831516266 }, { "entropy": 9.18392562866211, "epoch": 0.30897765473600947, "mean_token_accuracy": 0.7439024448394775, "num_tokens": 17174411.0, "step": 3125, "train/ce_loss": 0.7882577180862427 }, { "epoch": 0.30897765473600947, "step": 3125, "train/sim_loss": 0.0625 }, { "epoch": 0.30897765473600947, "step": 3125, "train/total_loss": 0.14132577180862427 }, { "entropy": 8.949541091918945, "epoch": 0.309076527585525, "mean_token_accuracy": 0.7547169923782349, "num_tokens": 17179903.0, "step": 3126, "train/ce_loss": 0.8312627673149109 }, { "epoch": 0.309076527585525, "step": 3126, "train/sim_loss": 0.03515625 }, { "epoch": 0.309076527585525, "step": 3126, "train/total_loss": 0.11828252673149109 }, { "entropy": 9.145468711853027, "epoch": 0.30917540043504055, "mean_token_accuracy": 0.7801339030265808, "num_tokens": 17185253.0, "step": 3127, "train/ce_loss": 0.7484632730484009 }, { "epoch": 0.30917540043504055, "step": 3127, "train/sim_loss": 0.08203125 }, { "epoch": 0.30917540043504055, "step": 3127, "train/total_loss": 0.1568775773048401 }, { "entropy": 9.03184700012207, "epoch": 0.30927427328455603, "mean_token_accuracy": 0.6859099864959717, "num_tokens": 17190929.0, "step": 3128, "train/ce_loss": 0.8835063576698303 }, { "epoch": 0.30927427328455603, "step": 3128, "train/sim_loss": 0.09375 }, { "epoch": 0.30927427328455603, "step": 3128, "train/total_loss": 0.18210063874721527 }, { "entropy": 9.259134292602539, "epoch": 0.3093731461340716, "mean_token_accuracy": 0.7669345736503601, "num_tokens": 17196291.0, "step": 3129, "train/ce_loss": 0.868071973323822 }, { "epoch": 0.3093731461340716, "step": 3129, "train/sim_loss": 0.0859375 }, { "epoch": 0.3093731461340716, "step": 3129, "train/total_loss": 0.17274469137191772 }, { "entropy": 9.331928253173828, "epoch": 0.3094720189835871, "mean_token_accuracy": 0.704011082649231, "num_tokens": 17201609.0, "step": 3130, "train/ce_loss": 0.5285792946815491 }, { "epoch": 0.3094720189835871, "step": 3130, "train/sim_loss": 0.02734375 }, { "epoch": 0.3094720189835871, "step": 3130, "train/total_loss": 0.08020168542861938 }, { "entropy": 9.167109489440918, "epoch": 0.30957089183310266, "mean_token_accuracy": 0.7324766516685486, "num_tokens": 17207107.0, "step": 3131, "train/ce_loss": 0.7970251441001892 }, { "epoch": 0.30957089183310266, "step": 3131, "train/sim_loss": 0.05859375 }, { "epoch": 0.30957089183310266, "step": 3131, "train/total_loss": 0.13829627633094788 }, { "entropy": 8.935436248779297, "epoch": 0.30966976468261814, "mean_token_accuracy": 0.7690355181694031, "num_tokens": 17212621.0, "step": 3132, "train/ce_loss": 0.7534241676330566 }, { "epoch": 0.30966976468261814, "step": 3132, "train/sim_loss": 0.05859375 }, { "epoch": 0.30966976468261814, "step": 3132, "train/total_loss": 0.13393616676330566 }, { "entropy": 9.069364547729492, "epoch": 0.3097686375321337, "mean_token_accuracy": 0.7121587991714478, "num_tokens": 17218096.0, "step": 3133, "train/ce_loss": 1.6351535320281982 }, { "epoch": 0.3097686375321337, "step": 3133, "train/sim_loss": 0.07421875 }, { "epoch": 0.3097686375321337, "step": 3133, "train/total_loss": 0.2377341091632843 }, { "entropy": 9.16293716430664, "epoch": 0.3098675103816492, "mean_token_accuracy": 0.7933491468429565, "num_tokens": 17223574.0, "step": 3134, "train/ce_loss": 0.6211536526679993 }, { "epoch": 0.3098675103816492, "step": 3134, "train/sim_loss": 0.02734375 }, { "epoch": 0.3098675103816492, "step": 3134, "train/total_loss": 0.0894591212272644 }, { "entropy": 8.940603256225586, "epoch": 0.3099663832311647, "mean_token_accuracy": 0.7703889608383179, "num_tokens": 17229000.0, "step": 3135, "train/ce_loss": 0.5067156553268433 }, { "epoch": 0.3099663832311647, "step": 3135, "train/sim_loss": 0.046875 }, { "epoch": 0.3099663832311647, "step": 3135, "train/total_loss": 0.09754656255245209 }, { "entropy": 9.105433464050293, "epoch": 0.31006525608068025, "mean_token_accuracy": 0.6639816164970398, "num_tokens": 17234539.0, "step": 3136, "train/ce_loss": 1.3524357080459595 }, { "epoch": 0.31006525608068025, "step": 3136, "train/sim_loss": 0.078125 }, { "epoch": 0.31006525608068025, "step": 3136, "train/total_loss": 0.21336857974529266 }, { "entropy": 8.546010971069336, "epoch": 0.3101641289301958, "mean_token_accuracy": 0.7605381011962891, "num_tokens": 17240287.0, "step": 3137, "train/ce_loss": 0.6862257719039917 }, { "epoch": 0.3101641289301958, "step": 3137, "train/sim_loss": 0.0234375 }, { "epoch": 0.3101641289301958, "step": 3137, "train/total_loss": 0.09206008166074753 }, { "entropy": 9.144416809082031, "epoch": 0.3102630017797113, "mean_token_accuracy": 0.6830065250396729, "num_tokens": 17245742.0, "step": 3138, "train/ce_loss": 1.068015456199646 }, { "epoch": 0.3102630017797113, "step": 3138, "train/sim_loss": 0.0625 }, { "epoch": 0.3102630017797113, "step": 3138, "train/total_loss": 0.16930153965950012 }, { "entropy": 9.020668029785156, "epoch": 0.3103618746292268, "mean_token_accuracy": 0.7200000286102295, "num_tokens": 17251282.0, "step": 3139, "train/ce_loss": 0.7186133861541748 }, { "epoch": 0.3103618746292268, "step": 3139, "train/sim_loss": 0.05859375 }, { "epoch": 0.3103618746292268, "step": 3139, "train/total_loss": 0.13045509159564972 }, { "epoch": 0.31046074747874236, "grad_norm": 0.7492353320121765, "learning_rate": 9.2263759086189e-06, "loss": 0.1448, "step": 3140 }, { "entropy": 9.279489517211914, "epoch": 0.31046074747874236, "mean_token_accuracy": 0.767507016658783, "num_tokens": 17256596.0, "step": 3140, "train/ce_loss": 0.40935754776000977 }, { "epoch": 0.31046074747874236, "step": 3140, "train/sim_loss": 0.10546875 }, { "epoch": 0.31046074747874236, "step": 3140, "train/total_loss": 0.14640450477600098 }, { "entropy": 8.980892181396484, "epoch": 0.31055962032825785, "mean_token_accuracy": 0.8035516142845154, "num_tokens": 17262065.0, "step": 3141, "train/ce_loss": 0.7863887548446655 }, { "epoch": 0.31055962032825785, "step": 3141, "train/sim_loss": 0.0625 }, { "epoch": 0.31055962032825785, "step": 3141, "train/total_loss": 0.14113888144493103 }, { "entropy": 8.939764022827148, "epoch": 0.3106584931777734, "mean_token_accuracy": 0.7299435138702393, "num_tokens": 17267512.0, "step": 3142, "train/ce_loss": 0.8680033683776855 }, { "epoch": 0.3106584931777734, "step": 3142, "train/sim_loss": 0.1015625 }, { "epoch": 0.3106584931777734, "step": 3142, "train/total_loss": 0.18836283683776855 }, { "entropy": 9.313264846801758, "epoch": 0.31075736602728893, "mean_token_accuracy": 0.7719546556472778, "num_tokens": 17272839.0, "step": 3143, "train/ce_loss": 0.5376891493797302 }, { "epoch": 0.31075736602728893, "step": 3143, "train/sim_loss": 0.078125 }, { "epoch": 0.31075736602728893, "step": 3143, "train/total_loss": 0.13189391791820526 }, { "entropy": 9.122360229492188, "epoch": 0.3108562388768044, "mean_token_accuracy": 0.7485029697418213, "num_tokens": 17278240.0, "step": 3144, "train/ce_loss": 0.8106685876846313 }, { "epoch": 0.3108562388768044, "step": 3144, "train/sim_loss": 0.05078125 }, { "epoch": 0.3108562388768044, "step": 3144, "train/total_loss": 0.13184811174869537 }, { "entropy": 8.980488777160645, "epoch": 0.31095511172631995, "mean_token_accuracy": 0.7672209143638611, "num_tokens": 17283734.0, "step": 3145, "train/ce_loss": 0.5040611028671265 }, { "epoch": 0.31095511172631995, "step": 3145, "train/sim_loss": 0.05859375 }, { "epoch": 0.31095511172631995, "step": 3145, "train/total_loss": 0.10899986326694489 }, { "entropy": 9.184673309326172, "epoch": 0.3110539845758355, "mean_token_accuracy": 0.75, "num_tokens": 17289083.0, "step": 3146, "train/ce_loss": 0.9875854253768921 }, { "epoch": 0.3110539845758355, "step": 3146, "train/sim_loss": 0.046875 }, { "epoch": 0.3110539845758355, "step": 3146, "train/total_loss": 0.1456335484981537 }, { "entropy": 9.07229995727539, "epoch": 0.311152857425351, "mean_token_accuracy": 0.7618534564971924, "num_tokens": 17294621.0, "step": 3147, "train/ce_loss": 0.5323328971862793 }, { "epoch": 0.311152857425351, "step": 3147, "train/sim_loss": 0.05078125 }, { "epoch": 0.311152857425351, "step": 3147, "train/total_loss": 0.10401454567909241 }, { "entropy": 8.674625396728516, "epoch": 0.3112517302748665, "mean_token_accuracy": 0.7110530734062195, "num_tokens": 17300387.0, "step": 3148, "train/ce_loss": 2.1750056743621826 }, { "epoch": 0.3112517302748665, "step": 3148, "train/sim_loss": 0.14453125 }, { "epoch": 0.3112517302748665, "step": 3148, "train/total_loss": 0.36203181743621826 }, { "entropy": 9.418913841247559, "epoch": 0.31135060312438206, "mean_token_accuracy": 0.6930692791938782, "num_tokens": 17305714.0, "step": 3149, "train/ce_loss": 1.2313957214355469 }, { "epoch": 0.31135060312438206, "step": 3149, "train/sim_loss": 0.09375 }, { "epoch": 0.31135060312438206, "step": 3149, "train/total_loss": 0.21688957512378693 }, { "entropy": 8.98388671875, "epoch": 0.31144947597389755, "mean_token_accuracy": 0.7195402383804321, "num_tokens": 17311172.0, "step": 3150, "train/ce_loss": 1.1076221466064453 }, { "epoch": 0.31144947597389755, "step": 3150, "train/sim_loss": 0.12109375 }, { "epoch": 0.31144947597389755, "step": 3150, "train/total_loss": 0.23185595870018005 }, { "entropy": 8.978836059570312, "epoch": 0.3115483488234131, "mean_token_accuracy": 0.7687723636627197, "num_tokens": 17316604.0, "step": 3151, "train/ce_loss": 0.5782110095024109 }, { "epoch": 0.3115483488234131, "step": 3151, "train/sim_loss": 0.109375 }, { "epoch": 0.3115483488234131, "step": 3151, "train/total_loss": 0.1671960949897766 }, { "entropy": 9.230999946594238, "epoch": 0.31164722167292863, "mean_token_accuracy": 0.765544056892395, "num_tokens": 17322025.0, "step": 3152, "train/ce_loss": 0.896418035030365 }, { "epoch": 0.31164722167292863, "step": 3152, "train/sim_loss": 0.1015625 }, { "epoch": 0.31164722167292863, "step": 3152, "train/total_loss": 0.19120430946350098 }, { "entropy": 9.20841121673584, "epoch": 0.3117460945224441, "mean_token_accuracy": 0.7072192430496216, "num_tokens": 17327374.0, "step": 3153, "train/ce_loss": 0.9731608629226685 }, { "epoch": 0.3117460945224441, "step": 3153, "train/sim_loss": 0.0859375 }, { "epoch": 0.3117460945224441, "step": 3153, "train/total_loss": 0.18325358629226685 }, { "entropy": 8.557950973510742, "epoch": 0.31184496737195966, "mean_token_accuracy": 0.7399665713310242, "num_tokens": 17333183.0, "step": 3154, "train/ce_loss": 0.45261967182159424 }, { "epoch": 0.31184496737195966, "step": 3154, "train/sim_loss": 0.03125 }, { "epoch": 0.31184496737195966, "step": 3154, "train/total_loss": 0.07651196420192719 }, { "entropy": 8.792404174804688, "epoch": 0.3119438402214752, "mean_token_accuracy": 0.746666669845581, "num_tokens": 17338634.0, "step": 3155, "train/ce_loss": 0.852712869644165 }, { "epoch": 0.3119438402214752, "step": 3155, "train/sim_loss": 0.06640625 }, { "epoch": 0.3119438402214752, "step": 3155, "train/total_loss": 0.15167754888534546 }, { "entropy": 9.127447128295898, "epoch": 0.3120427130709907, "mean_token_accuracy": 0.7412755489349365, "num_tokens": 17344077.0, "step": 3156, "train/ce_loss": 0.6051557660102844 }, { "epoch": 0.3120427130709907, "step": 3156, "train/sim_loss": 0.07421875 }, { "epoch": 0.3120427130709907, "step": 3156, "train/total_loss": 0.13473433256149292 }, { "entropy": 8.888646125793457, "epoch": 0.3121415859205062, "mean_token_accuracy": 0.7407407164573669, "num_tokens": 17349707.0, "step": 3157, "train/ce_loss": 0.8284165263175964 }, { "epoch": 0.3121415859205062, "step": 3157, "train/sim_loss": 0.0859375 }, { "epoch": 0.3121415859205062, "step": 3157, "train/total_loss": 0.1687791645526886 }, { "entropy": 9.178301811218262, "epoch": 0.31224045877002177, "mean_token_accuracy": 0.7483870983123779, "num_tokens": 17355063.0, "step": 3158, "train/ce_loss": 0.5793125629425049 }, { "epoch": 0.31224045877002177, "step": 3158, "train/sim_loss": 0.03515625 }, { "epoch": 0.31224045877002177, "step": 3158, "train/total_loss": 0.09308750927448273 }, { "entropy": 9.344057083129883, "epoch": 0.31233933161953725, "mean_token_accuracy": 0.6976743936538696, "num_tokens": 17360497.0, "step": 3159, "train/ce_loss": 0.6860307455062866 }, { "epoch": 0.31233933161953725, "step": 3159, "train/sim_loss": 0.109375 }, { "epoch": 0.31233933161953725, "step": 3159, "train/total_loss": 0.17797806859016418 }, { "epoch": 0.3124382044690528, "grad_norm": 0.8728207349777222, "learning_rate": 9.221431043860951e-06, "loss": 0.149, "step": 3160 }, { "entropy": 8.548235893249512, "epoch": 0.3124382044690528, "mean_token_accuracy": 0.7092360258102417, "num_tokens": 17366031.0, "step": 3160, "train/ce_loss": 0.6621466875076294 }, { "epoch": 0.3124382044690528, "step": 3160, "train/sim_loss": 0.08203125 }, { "epoch": 0.3124382044690528, "step": 3160, "train/total_loss": 0.1482459306716919 }, { "entropy": 9.125000953674316, "epoch": 0.31253707731856833, "mean_token_accuracy": 0.7235079407691956, "num_tokens": 17371479.0, "step": 3161, "train/ce_loss": 1.0628327131271362 }, { "epoch": 0.31253707731856833, "step": 3161, "train/sim_loss": 0.0546875 }, { "epoch": 0.31253707731856833, "step": 3161, "train/total_loss": 0.1609707772731781 }, { "entropy": 8.73045825958252, "epoch": 0.3126359501680838, "mean_token_accuracy": 0.7545164823532104, "num_tokens": 17377037.0, "step": 3162, "train/ce_loss": 0.7985665798187256 }, { "epoch": 0.3126359501680838, "step": 3162, "train/sim_loss": 0.08203125 }, { "epoch": 0.3126359501680838, "step": 3162, "train/total_loss": 0.16188791394233704 }, { "entropy": 9.180198669433594, "epoch": 0.31273482301759936, "mean_token_accuracy": 0.7310011982917786, "num_tokens": 17382438.0, "step": 3163, "train/ce_loss": 1.0672281980514526 }, { "epoch": 0.31273482301759936, "step": 3163, "train/sim_loss": 0.109375 }, { "epoch": 0.31273482301759936, "step": 3163, "train/total_loss": 0.21609783172607422 }, { "entropy": 9.045600891113281, "epoch": 0.3128336958671149, "mean_token_accuracy": 0.7766554355621338, "num_tokens": 17387981.0, "step": 3164, "train/ce_loss": 0.821208655834198 }, { "epoch": 0.3128336958671149, "step": 3164, "train/sim_loss": 0.0390625 }, { "epoch": 0.3128336958671149, "step": 3164, "train/total_loss": 0.1211833655834198 }, { "entropy": 8.827220916748047, "epoch": 0.3129325687166304, "mean_token_accuracy": 0.7671673893928528, "num_tokens": 17393574.0, "step": 3165, "train/ce_loss": 0.7001433372497559 }, { "epoch": 0.3129325687166304, "step": 3165, "train/sim_loss": 0.05078125 }, { "epoch": 0.3129325687166304, "step": 3165, "train/total_loss": 0.1207955852150917 }, { "entropy": 8.814104080200195, "epoch": 0.31303144156614593, "mean_token_accuracy": 0.8300721049308777, "num_tokens": 17399150.0, "step": 3166, "train/ce_loss": 0.4449072480201721 }, { "epoch": 0.31303144156614593, "step": 3166, "train/sim_loss": 0.0234375 }, { "epoch": 0.31303144156614593, "step": 3166, "train/total_loss": 0.06792822480201721 }, { "entropy": 9.231389999389648, "epoch": 0.31313031441566147, "mean_token_accuracy": 0.7190476059913635, "num_tokens": 17404544.0, "step": 3167, "train/ce_loss": 0.9643074870109558 }, { "epoch": 0.31313031441566147, "step": 3167, "train/sim_loss": 0.06640625 }, { "epoch": 0.31313031441566147, "step": 3167, "train/total_loss": 0.16283699870109558 }, { "entropy": 9.303369522094727, "epoch": 0.31322918726517696, "mean_token_accuracy": 0.755047082901001, "num_tokens": 17409834.0, "step": 3168, "train/ce_loss": 0.4168268144130707 }, { "epoch": 0.31322918726517696, "step": 3168, "train/sim_loss": 0.0703125 }, { "epoch": 0.31322918726517696, "step": 3168, "train/total_loss": 0.11199518293142319 }, { "entropy": 9.49625301361084, "epoch": 0.3133280601146925, "mean_token_accuracy": 0.6973886489868164, "num_tokens": 17415108.0, "step": 3169, "train/ce_loss": 0.9372715950012207 }, { "epoch": 0.3133280601146925, "step": 3169, "train/sim_loss": 0.1015625 }, { "epoch": 0.3133280601146925, "step": 3169, "train/total_loss": 0.19528967142105103 }, { "entropy": 9.16253662109375, "epoch": 0.31342693296420804, "mean_token_accuracy": 0.7413333058357239, "num_tokens": 17420489.0, "step": 3170, "train/ce_loss": 0.728691577911377 }, { "epoch": 0.31342693296420804, "step": 3170, "train/sim_loss": 0.078125 }, { "epoch": 0.31342693296420804, "step": 3170, "train/total_loss": 0.15099415183067322 }, { "entropy": 9.357658386230469, "epoch": 0.3135258058137236, "mean_token_accuracy": 0.7152588367462158, "num_tokens": 17425821.0, "step": 3171, "train/ce_loss": 0.4957939684391022 }, { "epoch": 0.3135258058137236, "step": 3171, "train/sim_loss": 0.06640625 }, { "epoch": 0.3135258058137236, "step": 3171, "train/total_loss": 0.11598564684391022 }, { "entropy": 9.043060302734375, "epoch": 0.31362467866323906, "mean_token_accuracy": 0.7877697944641113, "num_tokens": 17431329.0, "step": 3172, "train/ce_loss": 0.7710519433021545 }, { "epoch": 0.31362467866323906, "step": 3172, "train/sim_loss": 0.07421875 }, { "epoch": 0.31362467866323906, "step": 3172, "train/total_loss": 0.15132394433021545 }, { "entropy": 9.028130531311035, "epoch": 0.3137235515127546, "mean_token_accuracy": 0.7487623691558838, "num_tokens": 17436767.0, "step": 3173, "train/ce_loss": 0.5524727702140808 }, { "epoch": 0.3137235515127546, "step": 3173, "train/sim_loss": 0.046875 }, { "epoch": 0.3137235515127546, "step": 3173, "train/total_loss": 0.10212227702140808 }, { "entropy": 8.986757278442383, "epoch": 0.31382242436227015, "mean_token_accuracy": 0.7190426588058472, "num_tokens": 17442315.0, "step": 3174, "train/ce_loss": 0.820647656917572 }, { "epoch": 0.31382242436227015, "step": 3174, "train/sim_loss": 0.11328125 }, { "epoch": 0.31382242436227015, "step": 3174, "train/total_loss": 0.19534602761268616 }, { "entropy": 8.929916381835938, "epoch": 0.31392129721178563, "mean_token_accuracy": 0.7230215668678284, "num_tokens": 17447788.0, "step": 3175, "train/ce_loss": 0.6961028575897217 }, { "epoch": 0.31392129721178563, "step": 3175, "train/sim_loss": 0.0234375 }, { "epoch": 0.31392129721178563, "step": 3175, "train/total_loss": 0.09304779022932053 }, { "entropy": 8.71533489227295, "epoch": 0.3140201700613012, "mean_token_accuracy": 0.7061688303947449, "num_tokens": 17453587.0, "step": 3176, "train/ce_loss": 0.40202653408050537 }, { "epoch": 0.3140201700613012, "step": 3176, "train/sim_loss": 0.03125 }, { "epoch": 0.3140201700613012, "step": 3176, "train/total_loss": 0.07145265489816666 }, { "entropy": 9.053091049194336, "epoch": 0.3141190429108167, "mean_token_accuracy": 0.7477578520774841, "num_tokens": 17459092.0, "step": 3177, "train/ce_loss": 0.8237164616584778 }, { "epoch": 0.3141190429108167, "step": 3177, "train/sim_loss": 0.078125 }, { "epoch": 0.3141190429108167, "step": 3177, "train/total_loss": 0.16049665212631226 }, { "entropy": 8.903619766235352, "epoch": 0.3142179157603322, "mean_token_accuracy": 0.7410972118377686, "num_tokens": 17464760.0, "step": 3178, "train/ce_loss": 1.1513208150863647 }, { "epoch": 0.3142179157603322, "step": 3178, "train/sim_loss": 0.0859375 }, { "epoch": 0.3142179157603322, "step": 3178, "train/total_loss": 0.20106959342956543 }, { "entropy": 9.243215560913086, "epoch": 0.31431678860984774, "mean_token_accuracy": 0.7317365407943726, "num_tokens": 17470161.0, "step": 3179, "train/ce_loss": 0.5943939089775085 }, { "epoch": 0.31431678860984774, "step": 3179, "train/sim_loss": 0.0703125 }, { "epoch": 0.31431678860984774, "step": 3179, "train/total_loss": 0.12975189089775085 }, { "epoch": 0.3144156614593633, "grad_norm": 0.7169293165206909, "learning_rate": 9.216486179103003e-06, "loss": 0.1478, "step": 3180 }, { "entropy": 9.002706527709961, "epoch": 0.3144156614593633, "mean_token_accuracy": 0.7436440587043762, "num_tokens": 17475685.0, "step": 3180, "train/ce_loss": 1.2033751010894775 }, { "epoch": 0.3144156614593633, "step": 3180, "train/sim_loss": 0.1171875 }, { "epoch": 0.3144156614593633, "step": 3180, "train/total_loss": 0.23752501606941223 }, { "entropy": 8.913119316101074, "epoch": 0.31451453430887877, "mean_token_accuracy": 0.7593712210655212, "num_tokens": 17481178.0, "step": 3181, "train/ce_loss": 0.9862450361251831 }, { "epoch": 0.31451453430887877, "step": 3181, "train/sim_loss": 0.0546875 }, { "epoch": 0.31451453430887877, "step": 3181, "train/total_loss": 0.15331199765205383 }, { "entropy": 9.048397064208984, "epoch": 0.3146134071583943, "mean_token_accuracy": 0.7754892110824585, "num_tokens": 17486655.0, "step": 3182, "train/ce_loss": 0.5820977687835693 }, { "epoch": 0.3146134071583943, "step": 3182, "train/sim_loss": 0.046875 }, { "epoch": 0.3146134071583943, "step": 3182, "train/total_loss": 0.10508477687835693 }, { "entropy": 9.09281063079834, "epoch": 0.31471228000790985, "mean_token_accuracy": 0.7419738173484802, "num_tokens": 17492144.0, "step": 3183, "train/ce_loss": 0.7413637042045593 }, { "epoch": 0.31471228000790985, "step": 3183, "train/sim_loss": 0.12890625 }, { "epoch": 0.31471228000790985, "step": 3183, "train/total_loss": 0.2030426263809204 }, { "entropy": 9.299104690551758, "epoch": 0.31481115285742534, "mean_token_accuracy": 0.7350901365280151, "num_tokens": 17497491.0, "step": 3184, "train/ce_loss": 0.5345300436019897 }, { "epoch": 0.31481115285742534, "step": 3184, "train/sim_loss": 0.06640625 }, { "epoch": 0.31481115285742534, "step": 3184, "train/total_loss": 0.1198592558503151 }, { "entropy": 8.960628509521484, "epoch": 0.3149100257069409, "mean_token_accuracy": 0.7990654110908508, "num_tokens": 17502949.0, "step": 3185, "train/ce_loss": 0.6483533382415771 }, { "epoch": 0.3149100257069409, "step": 3185, "train/sim_loss": 0.01953125 }, { "epoch": 0.3149100257069409, "step": 3185, "train/total_loss": 0.0843665823340416 }, { "entropy": 9.06187629699707, "epoch": 0.3150088985564564, "mean_token_accuracy": 0.6951807141304016, "num_tokens": 17508422.0, "step": 3186, "train/ce_loss": 0.7096522450447083 }, { "epoch": 0.3150088985564564, "step": 3186, "train/sim_loss": 0.11328125 }, { "epoch": 0.3150088985564564, "step": 3186, "train/total_loss": 0.1842464804649353 }, { "entropy": 8.703865051269531, "epoch": 0.3151077714059719, "mean_token_accuracy": 0.7726337313652039, "num_tokens": 17514044.0, "step": 3187, "train/ce_loss": 0.6518259644508362 }, { "epoch": 0.3151077714059719, "step": 3187, "train/sim_loss": 0.03125 }, { "epoch": 0.3151077714059719, "step": 3187, "train/total_loss": 0.09643259644508362 }, { "entropy": 9.039073944091797, "epoch": 0.31520664425548744, "mean_token_accuracy": 0.7583547830581665, "num_tokens": 17519477.0, "step": 3188, "train/ce_loss": 0.8648772835731506 }, { "epoch": 0.31520664425548744, "step": 3188, "train/sim_loss": 0.05859375 }, { "epoch": 0.31520664425548744, "step": 3188, "train/total_loss": 0.14508149027824402 }, { "entropy": 8.748784065246582, "epoch": 0.315305517105003, "mean_token_accuracy": 0.727668821811676, "num_tokens": 17525012.0, "step": 3189, "train/ce_loss": 1.0246641635894775 }, { "epoch": 0.315305517105003, "step": 3189, "train/sim_loss": 0.078125 }, { "epoch": 0.315305517105003, "step": 3189, "train/total_loss": 0.18059141933918 }, { "entropy": 9.041132926940918, "epoch": 0.31540438995451847, "mean_token_accuracy": 0.718262791633606, "num_tokens": 17530548.0, "step": 3190, "train/ce_loss": 1.6711208820343018 }, { "epoch": 0.31540438995451847, "step": 3190, "train/sim_loss": 0.10546875 }, { "epoch": 0.31540438995451847, "step": 3190, "train/total_loss": 0.2725808620452881 }, { "entropy": 9.251371383666992, "epoch": 0.315503262804034, "mean_token_accuracy": 0.7448717951774597, "num_tokens": 17536316.0, "step": 3191, "train/ce_loss": 0.4844897985458374 }, { "epoch": 0.315503262804034, "step": 3191, "train/sim_loss": 0.03125 }, { "epoch": 0.315503262804034, "step": 3191, "train/total_loss": 0.07969897985458374 }, { "entropy": 9.028661727905273, "epoch": 0.31560213565354955, "mean_token_accuracy": 0.6888633966445923, "num_tokens": 17541792.0, "step": 3192, "train/ce_loss": 0.5157886743545532 }, { "epoch": 0.31560213565354955, "step": 3192, "train/sim_loss": 0.0390625 }, { "epoch": 0.31560213565354955, "step": 3192, "train/total_loss": 0.09064136445522308 }, { "entropy": 9.14578914642334, "epoch": 0.31570100850306504, "mean_token_accuracy": 0.7199504375457764, "num_tokens": 17547199.0, "step": 3193, "train/ce_loss": 0.6761664152145386 }, { "epoch": 0.31570100850306504, "step": 3193, "train/sim_loss": 0.10546875 }, { "epoch": 0.31570100850306504, "step": 3193, "train/total_loss": 0.17308539152145386 }, { "entropy": 9.52097225189209, "epoch": 0.3157998813525806, "mean_token_accuracy": 0.7486534714698792, "num_tokens": 17552382.0, "step": 3194, "train/ce_loss": 0.45085641741752625 }, { "epoch": 0.3157998813525806, "step": 3194, "train/sim_loss": 0.05859375 }, { "epoch": 0.3157998813525806, "step": 3194, "train/total_loss": 0.10367938876152039 }, { "entropy": 8.85772705078125, "epoch": 0.3158987542020961, "mean_token_accuracy": 0.7762646079063416, "num_tokens": 17558079.0, "step": 3195, "train/ce_loss": 0.8605527281761169 }, { "epoch": 0.3158987542020961, "step": 3195, "train/sim_loss": 0.0625 }, { "epoch": 0.3158987542020961, "step": 3195, "train/total_loss": 0.14855527877807617 }, { "entropy": 9.275160789489746, "epoch": 0.3159976270516116, "mean_token_accuracy": 0.7458704113960266, "num_tokens": 17563449.0, "step": 3196, "train/ce_loss": 0.6484063863754272 }, { "epoch": 0.3159976270516116, "step": 3196, "train/sim_loss": 0.078125 }, { "epoch": 0.3159976270516116, "step": 3196, "train/total_loss": 0.1429656445980072 }, { "entropy": 9.0556058883667, "epoch": 0.31609649990112715, "mean_token_accuracy": 0.7252873778343201, "num_tokens": 17568975.0, "step": 3197, "train/ce_loss": 0.8787002563476562 }, { "epoch": 0.31609649990112715, "step": 3197, "train/sim_loss": 0.05078125 }, { "epoch": 0.31609649990112715, "step": 3197, "train/total_loss": 0.1386512815952301 }, { "entropy": 9.133522987365723, "epoch": 0.3161953727506427, "mean_token_accuracy": 0.7207943797111511, "num_tokens": 17574480.0, "step": 3198, "train/ce_loss": 0.9527670741081238 }, { "epoch": 0.3161953727506427, "step": 3198, "train/sim_loss": 0.05078125 }, { "epoch": 0.3161953727506427, "step": 3198, "train/total_loss": 0.14605796337127686 }, { "entropy": 9.109245300292969, "epoch": 0.3162942456001582, "mean_token_accuracy": 0.7425083518028259, "num_tokens": 17579956.0, "step": 3199, "train/ce_loss": 0.549281120300293 }, { "epoch": 0.3162942456001582, "step": 3199, "train/sim_loss": 0.04296875 }, { "epoch": 0.3162942456001582, "step": 3199, "train/total_loss": 0.09789685904979706 }, { "epoch": 0.3163931184496737, "grad_norm": 0.7122979760169983, "learning_rate": 9.211541314345054e-06, "loss": 0.149, "step": 3200 }, { "entropy": 9.142839431762695, "epoch": 0.3163931184496737, "mean_token_accuracy": 0.7045143842697144, "num_tokens": 17585323.0, "step": 3200, "train/ce_loss": 0.7413316965103149 }, { "epoch": 0.3163931184496737, "step": 3200, "train/sim_loss": 0.046875 }, { "epoch": 0.3163931184496737, "step": 3200, "train/total_loss": 0.12100817263126373 }, { "entropy": 9.179192543029785, "epoch": 0.31649199129918926, "mean_token_accuracy": 0.7620967626571655, "num_tokens": 17590620.0, "step": 3201, "train/ce_loss": 0.7238313555717468 }, { "epoch": 0.31649199129918926, "step": 3201, "train/sim_loss": 0.04296875 }, { "epoch": 0.31649199129918926, "step": 3201, "train/total_loss": 0.11535188555717468 }, { "entropy": 8.749460220336914, "epoch": 0.31659086414870474, "mean_token_accuracy": 0.718199610710144, "num_tokens": 17596217.0, "step": 3202, "train/ce_loss": 1.2172656059265137 }, { "epoch": 0.31659086414870474, "step": 3202, "train/sim_loss": 0.08984375 }, { "epoch": 0.31659086414870474, "step": 3202, "train/total_loss": 0.21157032251358032 }, { "entropy": 9.155179977416992, "epoch": 0.3166897369982203, "mean_token_accuracy": 0.7331786751747131, "num_tokens": 17601735.0, "step": 3203, "train/ce_loss": 0.925238311290741 }, { "epoch": 0.3166897369982203, "step": 3203, "train/sim_loss": 0.08984375 }, { "epoch": 0.3166897369982203, "step": 3203, "train/total_loss": 0.18236759305000305 }, { "entropy": 8.73247241973877, "epoch": 0.3167886098477358, "mean_token_accuracy": 0.7622641324996948, "num_tokens": 17607446.0, "step": 3204, "train/ce_loss": 0.950086236000061 }, { "epoch": 0.3167886098477358, "step": 3204, "train/sim_loss": 0.0859375 }, { "epoch": 0.3167886098477358, "step": 3204, "train/total_loss": 0.18094612658023834 }, { "entropy": 9.006240844726562, "epoch": 0.3168874826972513, "mean_token_accuracy": 0.754291832447052, "num_tokens": 17612942.0, "step": 3205, "train/ce_loss": 0.8232473134994507 }, { "epoch": 0.3168874826972513, "step": 3205, "train/sim_loss": 0.0625 }, { "epoch": 0.3168874826972513, "step": 3205, "train/total_loss": 0.14482474327087402 }, { "entropy": 9.097203254699707, "epoch": 0.31698635554676685, "mean_token_accuracy": 0.7113401889801025, "num_tokens": 17618356.0, "step": 3206, "train/ce_loss": 0.6480202674865723 }, { "epoch": 0.31698635554676685, "step": 3206, "train/sim_loss": 0.04296875 }, { "epoch": 0.31698635554676685, "step": 3206, "train/total_loss": 0.10777077823877335 }, { "entropy": 9.291804313659668, "epoch": 0.3170852283962824, "mean_token_accuracy": 0.7487684488296509, "num_tokens": 17623822.0, "step": 3207, "train/ce_loss": 1.0712369680404663 }, { "epoch": 0.3170852283962824, "step": 3207, "train/sim_loss": 0.046875 }, { "epoch": 0.3170852283962824, "step": 3207, "train/total_loss": 0.1539987027645111 }, { "entropy": 9.112485885620117, "epoch": 0.3171841012457979, "mean_token_accuracy": 0.7116630673408508, "num_tokens": 17629334.0, "step": 3208, "train/ce_loss": 0.6987901926040649 }, { "epoch": 0.3171841012457979, "step": 3208, "train/sim_loss": 0.08203125 }, { "epoch": 0.3171841012457979, "step": 3208, "train/total_loss": 0.15191027522087097 }, { "entropy": 8.89693546295166, "epoch": 0.3172829740953134, "mean_token_accuracy": 0.7279322743415833, "num_tokens": 17634758.0, "step": 3209, "train/ce_loss": 0.9945276379585266 }, { "epoch": 0.3172829740953134, "step": 3209, "train/sim_loss": 0.05859375 }, { "epoch": 0.3172829740953134, "step": 3209, "train/total_loss": 0.15804651379585266 }, { "entropy": 8.883581161499023, "epoch": 0.31738184694482896, "mean_token_accuracy": 0.7705442905426025, "num_tokens": 17640397.0, "step": 3210, "train/ce_loss": 0.6207253932952881 }, { "epoch": 0.31738184694482896, "step": 3210, "train/sim_loss": 0.03125 }, { "epoch": 0.31738184694482896, "step": 3210, "train/total_loss": 0.09332254528999329 }, { "entropy": 9.226425170898438, "epoch": 0.31748071979434445, "mean_token_accuracy": 0.758186399936676, "num_tokens": 17645798.0, "step": 3211, "train/ce_loss": 0.6382618546485901 }, { "epoch": 0.31748071979434445, "step": 3211, "train/sim_loss": 0.07421875 }, { "epoch": 0.31748071979434445, "step": 3211, "train/total_loss": 0.13804493844509125 }, { "entropy": 8.876033782958984, "epoch": 0.31757959264386, "mean_token_accuracy": 0.7973273992538452, "num_tokens": 17651391.0, "step": 3212, "train/ce_loss": 0.8279364705085754 }, { "epoch": 0.31757959264386, "step": 3212, "train/sim_loss": 0.02734375 }, { "epoch": 0.31757959264386, "step": 3212, "train/total_loss": 0.11013739556074142 }, { "entropy": 8.913707733154297, "epoch": 0.31767846549337553, "mean_token_accuracy": 0.7282976508140564, "num_tokens": 17656903.0, "step": 3213, "train/ce_loss": 0.8755243420600891 }, { "epoch": 0.31767846549337553, "step": 3213, "train/sim_loss": 0.0625 }, { "epoch": 0.31767846549337553, "step": 3213, "train/total_loss": 0.15005242824554443 }, { "entropy": 8.569568634033203, "epoch": 0.31777733834289107, "mean_token_accuracy": 0.7043824791908264, "num_tokens": 17662640.0, "step": 3214, "train/ce_loss": 0.5984556674957275 }, { "epoch": 0.31777733834289107, "step": 3214, "train/sim_loss": 0.05078125 }, { "epoch": 0.31777733834289107, "step": 3214, "train/total_loss": 0.11062681674957275 }, { "entropy": 8.71242904663086, "epoch": 0.31787621119240655, "mean_token_accuracy": 0.7657142877578735, "num_tokens": 17668316.0, "step": 3215, "train/ce_loss": 1.0815380811691284 }, { "epoch": 0.31787621119240655, "step": 3215, "train/sim_loss": 0.01953125 }, { "epoch": 0.31787621119240655, "step": 3215, "train/total_loss": 0.1276850700378418 }, { "entropy": 9.09011459350586, "epoch": 0.3179750840419221, "mean_token_accuracy": 0.7562326788902283, "num_tokens": 17673671.0, "step": 3216, "train/ce_loss": 0.5091757774353027 }, { "epoch": 0.3179750840419221, "step": 3216, "train/sim_loss": 0.0546875 }, { "epoch": 0.3179750840419221, "step": 3216, "train/total_loss": 0.10560508072376251 }, { "entropy": 8.630599975585938, "epoch": 0.31807395689143764, "mean_token_accuracy": 0.7020202279090881, "num_tokens": 17679422.0, "step": 3217, "train/ce_loss": 1.0909863710403442 }, { "epoch": 0.31807395689143764, "step": 3217, "train/sim_loss": 0.07421875 }, { "epoch": 0.31807395689143764, "step": 3217, "train/total_loss": 0.1833173930644989 }, { "entropy": 9.573617935180664, "epoch": 0.3181728297409531, "mean_token_accuracy": 0.7861189842224121, "num_tokens": 17684608.0, "step": 3218, "train/ce_loss": 0.46423399448394775 }, { "epoch": 0.3181728297409531, "step": 3218, "train/sim_loss": 0.0234375 }, { "epoch": 0.3181728297409531, "step": 3218, "train/total_loss": 0.06986090540885925 }, { "entropy": 8.699023246765137, "epoch": 0.31827170259046866, "mean_token_accuracy": 0.729752779006958, "num_tokens": 17690292.0, "step": 3219, "train/ce_loss": 0.5730974078178406 }, { "epoch": 0.31827170259046866, "step": 3219, "train/sim_loss": 0.0234375 }, { "epoch": 0.31827170259046866, "step": 3219, "train/total_loss": 0.08074724674224854 }, { "epoch": 0.3183705754399842, "grad_norm": 0.6723690032958984, "learning_rate": 9.206596449587104e-06, "loss": 0.1391, "step": 3220 }, { "entropy": 9.145317077636719, "epoch": 0.3183705754399842, "mean_token_accuracy": 0.7628032565116882, "num_tokens": 17695594.0, "step": 3220, "train/ce_loss": 1.378491997718811 }, { "epoch": 0.3183705754399842, "step": 3220, "train/sim_loss": 0.05859375 }, { "epoch": 0.3183705754399842, "step": 3220, "train/total_loss": 0.19644294679164886 }, { "entropy": 9.163764953613281, "epoch": 0.3184694482894997, "mean_token_accuracy": 0.7609755992889404, "num_tokens": 17701039.0, "step": 3221, "train/ce_loss": 0.5284020304679871 }, { "epoch": 0.3184694482894997, "step": 3221, "train/sim_loss": 0.02734375 }, { "epoch": 0.3184694482894997, "step": 3221, "train/total_loss": 0.0801839530467987 }, { "entropy": 9.071619033813477, "epoch": 0.31856832113901523, "mean_token_accuracy": 0.7622193098068237, "num_tokens": 17706440.0, "step": 3222, "train/ce_loss": 0.5482897758483887 }, { "epoch": 0.31856832113901523, "step": 3222, "train/sim_loss": 0.0546875 }, { "epoch": 0.31856832113901523, "step": 3222, "train/total_loss": 0.10951647907495499 }, { "entropy": 8.61628532409668, "epoch": 0.3186671939885308, "mean_token_accuracy": 0.7733216881752014, "num_tokens": 17712426.0, "step": 3223, "train/ce_loss": 0.5864281058311462 }, { "epoch": 0.3186671939885308, "step": 3223, "train/sim_loss": 0.05859375 }, { "epoch": 0.3186671939885308, "step": 3223, "train/total_loss": 0.11723656207323074 }, { "entropy": 9.422870635986328, "epoch": 0.31876606683804626, "mean_token_accuracy": 0.7035398483276367, "num_tokens": 17717673.0, "step": 3224, "train/ce_loss": 0.951833188533783 }, { "epoch": 0.31876606683804626, "step": 3224, "train/sim_loss": 0.04296875 }, { "epoch": 0.31876606683804626, "step": 3224, "train/total_loss": 0.13815206289291382 }, { "entropy": 9.249242782592773, "epoch": 0.3188649396875618, "mean_token_accuracy": 0.7223650217056274, "num_tokens": 17723122.0, "step": 3225, "train/ce_loss": 0.6912848353385925 }, { "epoch": 0.3188649396875618, "step": 3225, "train/sim_loss": 0.0390625 }, { "epoch": 0.3188649396875618, "step": 3225, "train/total_loss": 0.10819098353385925 }, { "entropy": 8.938426971435547, "epoch": 0.31896381253707734, "mean_token_accuracy": 0.7314990758895874, "num_tokens": 17728667.0, "step": 3226, "train/ce_loss": 0.477341890335083 }, { "epoch": 0.31896381253707734, "step": 3226, "train/sim_loss": 0.0234375 }, { "epoch": 0.31896381253707734, "step": 3226, "train/total_loss": 0.07117168605327606 }, { "entropy": 9.200027465820312, "epoch": 0.3190626853865928, "mean_token_accuracy": 0.7346938848495483, "num_tokens": 17734088.0, "step": 3227, "train/ce_loss": 0.9892145395278931 }, { "epoch": 0.3190626853865928, "step": 3227, "train/sim_loss": 0.125 }, { "epoch": 0.3190626853865928, "step": 3227, "train/total_loss": 0.22392144799232483 }, { "entropy": 8.432881355285645, "epoch": 0.31916155823610837, "mean_token_accuracy": 0.7191953063011169, "num_tokens": 17739828.0, "step": 3228, "train/ce_loss": 0.7634513974189758 }, { "epoch": 0.31916155823610837, "step": 3228, "train/sim_loss": 0.09765625 }, { "epoch": 0.31916155823610837, "step": 3228, "train/total_loss": 0.17400139570236206 }, { "entropy": 8.77616024017334, "epoch": 0.3192604310856239, "mean_token_accuracy": 0.744966447353363, "num_tokens": 17745553.0, "step": 3229, "train/ce_loss": 0.23435360193252563 }, { "epoch": 0.3192604310856239, "step": 3229, "train/sim_loss": 0.03515625 }, { "epoch": 0.3192604310856239, "step": 3229, "train/total_loss": 0.05859161168336868 }, { "entropy": 9.152558326721191, "epoch": 0.3193593039351394, "mean_token_accuracy": 0.7275711297988892, "num_tokens": 17751025.0, "step": 3230, "train/ce_loss": 1.3623031377792358 }, { "epoch": 0.3193593039351394, "step": 3230, "train/sim_loss": 0.06640625 }, { "epoch": 0.3193593039351394, "step": 3230, "train/total_loss": 0.20263656973838806 }, { "entropy": 8.546022415161133, "epoch": 0.31945817678465493, "mean_token_accuracy": 0.725203275680542, "num_tokens": 17756782.0, "step": 3231, "train/ce_loss": 1.5486713647842407 }, { "epoch": 0.31945817678465493, "step": 3231, "train/sim_loss": 0.0859375 }, { "epoch": 0.31945817678465493, "step": 3231, "train/total_loss": 0.24080464243888855 }, { "entropy": 9.010370254516602, "epoch": 0.3195570496341705, "mean_token_accuracy": 0.7232510447502136, "num_tokens": 17762347.0, "step": 3232, "train/ce_loss": 1.8989828824996948 }, { "epoch": 0.3195570496341705, "step": 3232, "train/sim_loss": 0.05859375 }, { "epoch": 0.3195570496341705, "step": 3232, "train/total_loss": 0.2484920471906662 }, { "entropy": 8.955851554870605, "epoch": 0.31965592248368596, "mean_token_accuracy": 0.810234546661377, "num_tokens": 17767890.0, "step": 3233, "train/ce_loss": 0.3519980311393738 }, { "epoch": 0.31965592248368596, "step": 3233, "train/sim_loss": 0.0234375 }, { "epoch": 0.31965592248368596, "step": 3233, "train/total_loss": 0.05863730236887932 }, { "entropy": 8.93601131439209, "epoch": 0.3197547953332015, "mean_token_accuracy": 0.7654867172241211, "num_tokens": 17773421.0, "step": 3234, "train/ce_loss": 0.6126419901847839 }, { "epoch": 0.3197547953332015, "step": 3234, "train/sim_loss": 0.09375 }, { "epoch": 0.3197547953332015, "step": 3234, "train/total_loss": 0.15501420199871063 }, { "entropy": 9.103621482849121, "epoch": 0.31985366818271704, "mean_token_accuracy": 0.6915493011474609, "num_tokens": 17778697.0, "step": 3235, "train/ce_loss": 1.4339956045150757 }, { "epoch": 0.31985366818271704, "step": 3235, "train/sim_loss": 0.12109375 }, { "epoch": 0.31985366818271704, "step": 3235, "train/total_loss": 0.26449331641197205 }, { "entropy": 8.777109146118164, "epoch": 0.31995254103223253, "mean_token_accuracy": 0.696363627910614, "num_tokens": 17784419.0, "step": 3236, "train/ce_loss": 0.693494439125061 }, { "epoch": 0.31995254103223253, "step": 3236, "train/sim_loss": 0.11328125 }, { "epoch": 0.31995254103223253, "step": 3236, "train/total_loss": 0.18263068795204163 }, { "entropy": 9.114542961120605, "epoch": 0.32005141388174807, "mean_token_accuracy": 0.7381578683853149, "num_tokens": 17789824.0, "step": 3237, "train/ce_loss": 0.6705206632614136 }, { "epoch": 0.32005141388174807, "step": 3237, "train/sim_loss": 0.03125 }, { "epoch": 0.32005141388174807, "step": 3237, "train/total_loss": 0.09830206632614136 }, { "entropy": 9.128974914550781, "epoch": 0.3201502867312636, "mean_token_accuracy": 0.75738126039505, "num_tokens": 17795175.0, "step": 3238, "train/ce_loss": 0.577799379825592 }, { "epoch": 0.3201502867312636, "step": 3238, "train/sim_loss": 0.046875 }, { "epoch": 0.3201502867312636, "step": 3238, "train/total_loss": 0.1046549379825592 }, { "entropy": 9.18431282043457, "epoch": 0.3202491595807791, "mean_token_accuracy": 0.7294397950172424, "num_tokens": 17800612.0, "step": 3239, "train/ce_loss": 0.5496288537979126 }, { "epoch": 0.3202491595807791, "step": 3239, "train/sim_loss": 0.0234375 }, { "epoch": 0.3202491595807791, "step": 3239, "train/total_loss": 0.0784003883600235 }, { "epoch": 0.32034803243029464, "grad_norm": 0.7616966962814331, "learning_rate": 9.201651584829155e-06, "loss": 0.144, "step": 3240 }, { "entropy": 9.080477714538574, "epoch": 0.32034803243029464, "mean_token_accuracy": 0.7902010083198547, "num_tokens": 17805968.0, "step": 3240, "train/ce_loss": 0.7267107963562012 }, { "epoch": 0.32034803243029464, "step": 3240, "train/sim_loss": 0.05859375 }, { "epoch": 0.32034803243029464, "step": 3240, "train/total_loss": 0.1312648355960846 }, { "entropy": 9.322620391845703, "epoch": 0.3204469052798102, "mean_token_accuracy": 0.747474730014801, "num_tokens": 17811350.0, "step": 3241, "train/ce_loss": 0.7083874940872192 }, { "epoch": 0.3204469052798102, "step": 3241, "train/sim_loss": 0.03515625 }, { "epoch": 0.3204469052798102, "step": 3241, "train/total_loss": 0.10599499940872192 }, { "entropy": 9.303369522094727, "epoch": 0.32054577812932566, "mean_token_accuracy": 0.7410714030265808, "num_tokens": 17816807.0, "step": 3242, "train/ce_loss": 1.0755300521850586 }, { "epoch": 0.32054577812932566, "step": 3242, "train/sim_loss": 0.12109375 }, { "epoch": 0.32054577812932566, "step": 3242, "train/total_loss": 0.22864675521850586 }, { "entropy": 8.812226295471191, "epoch": 0.3206446509788412, "mean_token_accuracy": 0.7311272025108337, "num_tokens": 17822563.0, "step": 3243, "train/ce_loss": 0.9488539695739746 }, { "epoch": 0.3206446509788412, "step": 3243, "train/sim_loss": 0.08984375 }, { "epoch": 0.3206446509788412, "step": 3243, "train/total_loss": 0.18472915887832642 }, { "entropy": 8.79484748840332, "epoch": 0.32074352382835675, "mean_token_accuracy": 0.7179487347602844, "num_tokens": 17828091.0, "step": 3244, "train/ce_loss": 0.6520364880561829 }, { "epoch": 0.32074352382835675, "step": 3244, "train/sim_loss": 0.09375 }, { "epoch": 0.32074352382835675, "step": 3244, "train/total_loss": 0.15895365178585052 }, { "entropy": 8.942248344421387, "epoch": 0.32084239667787223, "mean_token_accuracy": 0.6970617771148682, "num_tokens": 17833708.0, "step": 3245, "train/ce_loss": 1.015682339668274 }, { "epoch": 0.32084239667787223, "step": 3245, "train/sim_loss": 0.05078125 }, { "epoch": 0.32084239667787223, "step": 3245, "train/total_loss": 0.15234948694705963 }, { "entropy": 8.722761154174805, "epoch": 0.3209412695273878, "mean_token_accuracy": 0.7392523288726807, "num_tokens": 17839454.0, "step": 3246, "train/ce_loss": 0.7049726843833923 }, { "epoch": 0.3209412695273878, "step": 3246, "train/sim_loss": 0.0859375 }, { "epoch": 0.3209412695273878, "step": 3246, "train/total_loss": 0.1564347743988037 }, { "entropy": 8.72413444519043, "epoch": 0.3210401423769033, "mean_token_accuracy": 0.7696078419685364, "num_tokens": 17845146.0, "step": 3247, "train/ce_loss": 1.2183129787445068 }, { "epoch": 0.3210401423769033, "step": 3247, "train/sim_loss": 0.05078125 }, { "epoch": 0.3210401423769033, "step": 3247, "train/total_loss": 0.17261254787445068 }, { "entropy": 8.969085693359375, "epoch": 0.3211390152264188, "mean_token_accuracy": 0.7093425393104553, "num_tokens": 17850588.0, "step": 3248, "train/ce_loss": 0.9679118394851685 }, { "epoch": 0.3211390152264188, "step": 3248, "train/sim_loss": 0.05078125 }, { "epoch": 0.3211390152264188, "step": 3248, "train/total_loss": 0.14757242798805237 }, { "entropy": 8.737431526184082, "epoch": 0.32123788807593434, "mean_token_accuracy": 0.8122568130493164, "num_tokens": 17856256.0, "step": 3249, "train/ce_loss": 0.4285179376602173 }, { "epoch": 0.32123788807593434, "step": 3249, "train/sim_loss": 0.03125 }, { "epoch": 0.32123788807593434, "step": 3249, "train/total_loss": 0.07410179078578949 }, { "entropy": 9.02369499206543, "epoch": 0.3213367609254499, "mean_token_accuracy": 0.7171464562416077, "num_tokens": 17861685.0, "step": 3250, "train/ce_loss": 0.8905037641525269 }, { "epoch": 0.3213367609254499, "step": 3250, "train/sim_loss": 0.0625 }, { "epoch": 0.3213367609254499, "step": 3250, "train/total_loss": 0.15155038237571716 }, { "entropy": 8.617911338806152, "epoch": 0.32143563377496537, "mean_token_accuracy": 0.6989351511001587, "num_tokens": 17867339.0, "step": 3251, "train/ce_loss": 0.5138656497001648 }, { "epoch": 0.32143563377496537, "step": 3251, "train/sim_loss": 0.05078125 }, { "epoch": 0.32143563377496537, "step": 3251, "train/total_loss": 0.10216781497001648 }, { "entropy": 9.026510238647461, "epoch": 0.3215345066244809, "mean_token_accuracy": 0.7535668015480042, "num_tokens": 17872643.0, "step": 3252, "train/ce_loss": 0.9442238211631775 }, { "epoch": 0.3215345066244809, "step": 3252, "train/sim_loss": 0.0546875 }, { "epoch": 0.3215345066244809, "step": 3252, "train/total_loss": 0.14910988509655 }, { "entropy": 9.144336700439453, "epoch": 0.32163337947399645, "mean_token_accuracy": 0.7348777055740356, "num_tokens": 17878041.0, "step": 3253, "train/ce_loss": 0.7527347803115845 }, { "epoch": 0.32163337947399645, "step": 3253, "train/sim_loss": 0.0703125 }, { "epoch": 0.32163337947399645, "step": 3253, "train/total_loss": 0.14558598399162292 }, { "entropy": 9.03337287902832, "epoch": 0.321732252323512, "mean_token_accuracy": 0.7634854912757874, "num_tokens": 17883396.0, "step": 3254, "train/ce_loss": 0.5834844708442688 }, { "epoch": 0.321732252323512, "step": 3254, "train/sim_loss": 0.0703125 }, { "epoch": 0.321732252323512, "step": 3254, "train/total_loss": 0.12866094708442688 }, { "entropy": 9.1356840133667, "epoch": 0.3218311251730275, "mean_token_accuracy": 0.7561797499656677, "num_tokens": 17888938.0, "step": 3255, "train/ce_loss": 1.075952410697937 }, { "epoch": 0.3218311251730275, "step": 3255, "train/sim_loss": 0.0546875 }, { "epoch": 0.3218311251730275, "step": 3255, "train/total_loss": 0.16228273510932922 }, { "entropy": 9.064414978027344, "epoch": 0.321929998022543, "mean_token_accuracy": 0.7339003682136536, "num_tokens": 17894541.0, "step": 3256, "train/ce_loss": 0.7477004528045654 }, { "epoch": 0.321929998022543, "step": 3256, "train/sim_loss": 0.0390625 }, { "epoch": 0.321929998022543, "step": 3256, "train/total_loss": 0.11383254826068878 }, { "entropy": 9.02135944366455, "epoch": 0.32202887087205856, "mean_token_accuracy": 0.7171609997749329, "num_tokens": 17900036.0, "step": 3257, "train/ce_loss": 1.012001395225525 }, { "epoch": 0.32202887087205856, "step": 3257, "train/sim_loss": 0.04296875 }, { "epoch": 0.32202887087205856, "step": 3257, "train/total_loss": 0.144168883562088 }, { "entropy": 8.858884811401367, "epoch": 0.32212774372157404, "mean_token_accuracy": 0.7063020467758179, "num_tokens": 17905428.0, "step": 3258, "train/ce_loss": 0.9272046089172363 }, { "epoch": 0.32212774372157404, "step": 3258, "train/sim_loss": 0.0703125 }, { "epoch": 0.32212774372157404, "step": 3258, "train/total_loss": 0.16303296387195587 }, { "entropy": 8.863408088684082, "epoch": 0.3222266165710896, "mean_token_accuracy": 0.7256729006767273, "num_tokens": 17910960.0, "step": 3259, "train/ce_loss": 0.7634143829345703 }, { "epoch": 0.3222266165710896, "step": 3259, "train/sim_loss": 0.06640625 }, { "epoch": 0.3222266165710896, "step": 3259, "train/total_loss": 0.142747700214386 }, { "epoch": 0.3223254894206051, "grad_norm": 0.9623202681541443, "learning_rate": 9.196706720071207e-06, "loss": 0.149, "step": 3260 }, { "entropy": 8.694595336914062, "epoch": 0.3223254894206051, "mean_token_accuracy": 0.7818671464920044, "num_tokens": 17916764.0, "step": 3260, "train/ce_loss": 0.36998075246810913 }, { "epoch": 0.3223254894206051, "step": 3260, "train/sim_loss": 0.0234375 }, { "epoch": 0.3223254894206051, "step": 3260, "train/total_loss": 0.06043557450175285 }, { "entropy": 8.77747917175293, "epoch": 0.3224243622701206, "mean_token_accuracy": 0.7401197552680969, "num_tokens": 17922253.0, "step": 3261, "train/ce_loss": 1.102950096130371 }, { "epoch": 0.3224243622701206, "step": 3261, "train/sim_loss": 0.0703125 }, { "epoch": 0.3224243622701206, "step": 3261, "train/total_loss": 0.18060751259326935 }, { "entropy": 9.176493644714355, "epoch": 0.32252323511963615, "mean_token_accuracy": 0.7181603908538818, "num_tokens": 17927746.0, "step": 3262, "train/ce_loss": 0.8128913044929504 }, { "epoch": 0.32252323511963615, "step": 3262, "train/sim_loss": 0.09765625 }, { "epoch": 0.32252323511963615, "step": 3262, "train/total_loss": 0.178945392370224 }, { "entropy": 9.317118644714355, "epoch": 0.3226221079691517, "mean_token_accuracy": 0.7426108121871948, "num_tokens": 17933086.0, "step": 3263, "train/ce_loss": 0.7137615084648132 }, { "epoch": 0.3226221079691517, "step": 3263, "train/sim_loss": 0.125 }, { "epoch": 0.3226221079691517, "step": 3263, "train/total_loss": 0.19637614488601685 }, { "entropy": 8.962414741516113, "epoch": 0.3227209808186672, "mean_token_accuracy": 0.7406542301177979, "num_tokens": 17938557.0, "step": 3264, "train/ce_loss": 0.6506192088127136 }, { "epoch": 0.3227209808186672, "step": 3264, "train/sim_loss": 0.0390625 }, { "epoch": 0.3227209808186672, "step": 3264, "train/total_loss": 0.10412441939115524 }, { "entropy": 9.385013580322266, "epoch": 0.3228198536681827, "mean_token_accuracy": 0.7754532694816589, "num_tokens": 17943778.0, "step": 3265, "train/ce_loss": 0.6763017177581787 }, { "epoch": 0.3228198536681827, "step": 3265, "train/sim_loss": 0.05078125 }, { "epoch": 0.3228198536681827, "step": 3265, "train/total_loss": 0.11841142177581787 }, { "entropy": 8.863534927368164, "epoch": 0.32291872651769826, "mean_token_accuracy": 0.7219512462615967, "num_tokens": 17949183.0, "step": 3266, "train/ce_loss": 0.543550431728363 }, { "epoch": 0.32291872651769826, "step": 3266, "train/sim_loss": 0.0390625 }, { "epoch": 0.32291872651769826, "step": 3266, "train/total_loss": 0.09341754019260406 }, { "entropy": 9.422264099121094, "epoch": 0.32301759936721375, "mean_token_accuracy": 0.7307171821594238, "num_tokens": 17954498.0, "step": 3267, "train/ce_loss": 0.7189306020736694 }, { "epoch": 0.32301759936721375, "step": 3267, "train/sim_loss": 0.03125 }, { "epoch": 0.32301759936721375, "step": 3267, "train/total_loss": 0.10314305871725082 }, { "entropy": 8.962303161621094, "epoch": 0.3231164722167293, "mean_token_accuracy": 0.794731080532074, "num_tokens": 17960034.0, "step": 3268, "train/ce_loss": 0.6643767952919006 }, { "epoch": 0.3231164722167293, "step": 3268, "train/sim_loss": 0.07421875 }, { "epoch": 0.3231164722167293, "step": 3268, "train/total_loss": 0.14065644145011902 }, { "entropy": 9.09814167022705, "epoch": 0.32321534506624483, "mean_token_accuracy": 0.794369637966156, "num_tokens": 17965521.0, "step": 3269, "train/ce_loss": 0.5965721011161804 }, { "epoch": 0.32321534506624483, "step": 3269, "train/sim_loss": 0.0234375 }, { "epoch": 0.32321534506624483, "step": 3269, "train/total_loss": 0.08309471607208252 }, { "entropy": 8.69002914428711, "epoch": 0.3233142179157603, "mean_token_accuracy": 0.7149446606636047, "num_tokens": 17971196.0, "step": 3270, "train/ce_loss": 0.47946059703826904 }, { "epoch": 0.3233142179157603, "step": 3270, "train/sim_loss": 0.05078125 }, { "epoch": 0.3233142179157603, "step": 3270, "train/total_loss": 0.09872731566429138 }, { "entropy": 9.08627986907959, "epoch": 0.32341309076527586, "mean_token_accuracy": 0.746583878993988, "num_tokens": 17976561.0, "step": 3271, "train/ce_loss": 0.585972249507904 }, { "epoch": 0.32341309076527586, "step": 3271, "train/sim_loss": 0.01953125 }, { "epoch": 0.32341309076527586, "step": 3271, "train/total_loss": 0.07812847197055817 }, { "entropy": 9.079699516296387, "epoch": 0.3235119636147914, "mean_token_accuracy": 0.725239634513855, "num_tokens": 17982008.0, "step": 3272, "train/ce_loss": 0.610024631023407 }, { "epoch": 0.3235119636147914, "step": 3272, "train/sim_loss": 0.03125 }, { "epoch": 0.3235119636147914, "step": 3272, "train/total_loss": 0.0922524631023407 }, { "entropy": 8.63966178894043, "epoch": 0.3236108364643069, "mean_token_accuracy": 0.7546296119689941, "num_tokens": 17987499.0, "step": 3273, "train/ce_loss": 0.8584446310997009 }, { "epoch": 0.3236108364643069, "step": 3273, "train/sim_loss": 0.06640625 }, { "epoch": 0.3236108364643069, "step": 3273, "train/total_loss": 0.15225070714950562 }, { "entropy": 9.0169038772583, "epoch": 0.3237097093138224, "mean_token_accuracy": 0.7294871807098389, "num_tokens": 17992885.0, "step": 3274, "train/ce_loss": 0.9439672231674194 }, { "epoch": 0.3237097093138224, "step": 3274, "train/sim_loss": 0.0625 }, { "epoch": 0.3237097093138224, "step": 3274, "train/total_loss": 0.15689672529697418 }, { "entropy": 9.10972785949707, "epoch": 0.32380858216333797, "mean_token_accuracy": 0.7599545121192932, "num_tokens": 17998350.0, "step": 3275, "train/ce_loss": 1.0027294158935547 }, { "epoch": 0.32380858216333797, "step": 3275, "train/sim_loss": 0.0546875 }, { "epoch": 0.32380858216333797, "step": 3275, "train/total_loss": 0.15496045351028442 }, { "entropy": 8.838521957397461, "epoch": 0.32390745501285345, "mean_token_accuracy": 0.7580645084381104, "num_tokens": 18004029.0, "step": 3276, "train/ce_loss": 0.8699126839637756 }, { "epoch": 0.32390745501285345, "step": 3276, "train/sim_loss": 0.0625 }, { "epoch": 0.32390745501285345, "step": 3276, "train/total_loss": 0.14949128031730652 }, { "entropy": 9.270792007446289, "epoch": 0.324006327862369, "mean_token_accuracy": 0.7515528202056885, "num_tokens": 18009297.0, "step": 3277, "train/ce_loss": 0.6562734246253967 }, { "epoch": 0.324006327862369, "step": 3277, "train/sim_loss": 0.09375 }, { "epoch": 0.324006327862369, "step": 3277, "train/total_loss": 0.1593773365020752 }, { "entropy": 8.945789337158203, "epoch": 0.32410520071188453, "mean_token_accuracy": 0.7251184582710266, "num_tokens": 18015164.0, "step": 3278, "train/ce_loss": 0.6296507716178894 }, { "epoch": 0.32410520071188453, "step": 3278, "train/sim_loss": 0.0546875 }, { "epoch": 0.32410520071188453, "step": 3278, "train/total_loss": 0.11765258014202118 }, { "entropy": 9.322494506835938, "epoch": 0.3242040735614, "mean_token_accuracy": 0.7481909990310669, "num_tokens": 18020483.0, "step": 3279, "train/ce_loss": 0.6504072546958923 }, { "epoch": 0.3242040735614, "step": 3279, "train/sim_loss": 0.03515625 }, { "epoch": 0.3242040735614, "step": 3279, "train/total_loss": 0.10019697993993759 }, { "epoch": 0.32430294641091556, "grad_norm": 0.8743143677711487, "learning_rate": 9.191761855313257e-06, "loss": 0.1457, "step": 3280 }, { "entropy": 8.537601470947266, "epoch": 0.32430294641091556, "mean_token_accuracy": 0.6843910813331604, "num_tokens": 18026220.0, "step": 3280, "train/ce_loss": 0.8086438179016113 }, { "epoch": 0.32430294641091556, "step": 3280, "train/sim_loss": 0.0703125 }, { "epoch": 0.32430294641091556, "step": 3280, "train/total_loss": 0.15117688477039337 }, { "entropy": 8.791933059692383, "epoch": 0.3244018192604311, "mean_token_accuracy": 0.8414141535758972, "num_tokens": 18031896.0, "step": 3281, "train/ce_loss": 0.9175693988800049 }, { "epoch": 0.3244018192604311, "step": 3281, "train/sim_loss": 0.16796875 }, { "epoch": 0.3244018192604311, "step": 3281, "train/total_loss": 0.2597256898880005 }, { "entropy": 9.275796890258789, "epoch": 0.3245006921099466, "mean_token_accuracy": 0.7872892618179321, "num_tokens": 18037235.0, "step": 3282, "train/ce_loss": 0.6511682271957397 }, { "epoch": 0.3245006921099466, "step": 3282, "train/sim_loss": 0.15625 }, { "epoch": 0.3245006921099466, "step": 3282, "train/total_loss": 0.22136682271957397 }, { "entropy": 9.120903015136719, "epoch": 0.32459956495946213, "mean_token_accuracy": 0.762326180934906, "num_tokens": 18042693.0, "step": 3283, "train/ce_loss": 0.32227933406829834 }, { "epoch": 0.32459956495946213, "step": 3283, "train/sim_loss": 0.01953125 }, { "epoch": 0.32459956495946213, "step": 3283, "train/total_loss": 0.051759183406829834 }, { "entropy": 8.962547302246094, "epoch": 0.32469843780897767, "mean_token_accuracy": 0.7881165742874146, "num_tokens": 18048112.0, "step": 3284, "train/ce_loss": 0.7206968069076538 }, { "epoch": 0.32469843780897767, "step": 3284, "train/sim_loss": 0.046875 }, { "epoch": 0.32469843780897767, "step": 3284, "train/total_loss": 0.1189446821808815 }, { "entropy": 8.888264656066895, "epoch": 0.32479731065849315, "mean_token_accuracy": 0.7076735496520996, "num_tokens": 18053604.0, "step": 3285, "train/ce_loss": 0.6786348223686218 }, { "epoch": 0.32479731065849315, "step": 3285, "train/sim_loss": 0.05078125 }, { "epoch": 0.32479731065849315, "step": 3285, "train/total_loss": 0.11864473670721054 }, { "entropy": 8.590400695800781, "epoch": 0.3248961835080087, "mean_token_accuracy": 0.7830578684806824, "num_tokens": 18059185.0, "step": 3286, "train/ce_loss": 0.5936170816421509 }, { "epoch": 0.3248961835080087, "step": 3286, "train/sim_loss": 0.02734375 }, { "epoch": 0.3248961835080087, "step": 3286, "train/total_loss": 0.08670546114444733 }, { "entropy": 8.569120407104492, "epoch": 0.32499505635752424, "mean_token_accuracy": 0.7363184094429016, "num_tokens": 18064880.0, "step": 3287, "train/ce_loss": 0.8246201872825623 }, { "epoch": 0.32499505635752424, "step": 3287, "train/sim_loss": 0.09765625 }, { "epoch": 0.32499505635752424, "step": 3287, "train/total_loss": 0.18011826276779175 }, { "entropy": 9.03376579284668, "epoch": 0.3250939292070397, "mean_token_accuracy": 0.7502714395523071, "num_tokens": 18070389.0, "step": 3288, "train/ce_loss": 0.4990786015987396 }, { "epoch": 0.3250939292070397, "step": 3288, "train/sim_loss": 0.01953125 }, { "epoch": 0.3250939292070397, "step": 3288, "train/total_loss": 0.0694391131401062 }, { "entropy": 9.039407730102539, "epoch": 0.32519280205655526, "mean_token_accuracy": 0.7595375776290894, "num_tokens": 18075879.0, "step": 3289, "train/ce_loss": 0.3749963343143463 }, { "epoch": 0.32519280205655526, "step": 3289, "train/sim_loss": 0.04296875 }, { "epoch": 0.32519280205655526, "step": 3289, "train/total_loss": 0.08046838641166687 }, { "entropy": 9.300722122192383, "epoch": 0.3252916749060708, "mean_token_accuracy": 0.744107723236084, "num_tokens": 18081115.0, "step": 3290, "train/ce_loss": 0.5895193815231323 }, { "epoch": 0.3252916749060708, "step": 3290, "train/sim_loss": 0.0546875 }, { "epoch": 0.3252916749060708, "step": 3290, "train/total_loss": 0.11363944411277771 }, { "entropy": 9.019305229187012, "epoch": 0.3253905477555863, "mean_token_accuracy": 0.774193525314331, "num_tokens": 18086505.0, "step": 3291, "train/ce_loss": 0.477668821811676 }, { "epoch": 0.3253905477555863, "step": 3291, "train/sim_loss": 0.0546875 }, { "epoch": 0.3253905477555863, "step": 3291, "train/total_loss": 0.10245437920093536 }, { "entropy": 8.731000900268555, "epoch": 0.32548942060510183, "mean_token_accuracy": 0.7107279896736145, "num_tokens": 18092217.0, "step": 3292, "train/ce_loss": 0.7381085753440857 }, { "epoch": 0.32548942060510183, "step": 3292, "train/sim_loss": 0.06640625 }, { "epoch": 0.32548942060510183, "step": 3292, "train/total_loss": 0.1402171105146408 }, { "entropy": 8.998296737670898, "epoch": 0.3255882934546174, "mean_token_accuracy": 0.8094144463539124, "num_tokens": 18097711.0, "step": 3293, "train/ce_loss": 0.5393453240394592 }, { "epoch": 0.3255882934546174, "step": 3293, "train/sim_loss": 0.05078125 }, { "epoch": 0.3255882934546174, "step": 3293, "train/total_loss": 0.10471577942371368 }, { "entropy": 8.998517990112305, "epoch": 0.32568716630413286, "mean_token_accuracy": 0.8129032254219055, "num_tokens": 18103198.0, "step": 3294, "train/ce_loss": 0.4773982763290405 }, { "epoch": 0.32568716630413286, "step": 3294, "train/sim_loss": 0.0234375 }, { "epoch": 0.32568716630413286, "step": 3294, "train/total_loss": 0.07117733359336853 }, { "entropy": 9.210780143737793, "epoch": 0.3257860391536484, "mean_token_accuracy": 0.7668965458869934, "num_tokens": 18108487.0, "step": 3295, "train/ce_loss": 0.781674325466156 }, { "epoch": 0.3257860391536484, "step": 3295, "train/sim_loss": 0.0546875 }, { "epoch": 0.3257860391536484, "step": 3295, "train/total_loss": 0.13285493850708008 }, { "entropy": 9.082758903503418, "epoch": 0.32588491200316394, "mean_token_accuracy": 0.7076719403266907, "num_tokens": 18113771.0, "step": 3296, "train/ce_loss": 0.6265847682952881 }, { "epoch": 0.32588491200316394, "step": 3296, "train/sim_loss": 0.0234375 }, { "epoch": 0.32588491200316394, "step": 3296, "train/total_loss": 0.08609598129987717 }, { "entropy": 8.600619316101074, "epoch": 0.3259837848526795, "mean_token_accuracy": 0.7264297008514404, "num_tokens": 18119615.0, "step": 3297, "train/ce_loss": 0.42297589778900146 }, { "epoch": 0.3259837848526795, "step": 3297, "train/sim_loss": 0.0703125 }, { "epoch": 0.3259837848526795, "step": 3297, "train/total_loss": 0.11261008679866791 }, { "entropy": 8.940589904785156, "epoch": 0.32608265770219497, "mean_token_accuracy": 0.7842605113983154, "num_tokens": 18124961.0, "step": 3298, "train/ce_loss": 0.903731107711792 }, { "epoch": 0.32608265770219497, "step": 3298, "train/sim_loss": 0.12890625 }, { "epoch": 0.32608265770219497, "step": 3298, "train/total_loss": 0.21927936375141144 }, { "entropy": 8.751570701599121, "epoch": 0.3261815305517105, "mean_token_accuracy": 0.7505543231964111, "num_tokens": 18130567.0, "step": 3299, "train/ce_loss": 1.146727442741394 }, { "epoch": 0.3261815305517105, "step": 3299, "train/sim_loss": 0.08203125 }, { "epoch": 0.3261815305517105, "step": 3299, "train/total_loss": 0.19670400023460388 }, { "epoch": 0.32628040340122605, "grad_norm": 0.7953982949256897, "learning_rate": 9.18681699055531e-06, "loss": 0.1357, "step": 3300 }, { "entropy": 8.922477722167969, "epoch": 0.32628040340122605, "mean_token_accuracy": 0.7740174531936646, "num_tokens": 18136084.0, "step": 3300, "train/ce_loss": 0.8472011685371399 }, { "epoch": 0.32628040340122605, "step": 3300, "train/sim_loss": 0.109375 }, { "epoch": 0.32628040340122605, "step": 3300, "train/total_loss": 0.19409511983394623 }, { "entropy": 9.132375717163086, "epoch": 0.32637927625074153, "mean_token_accuracy": 0.7278177738189697, "num_tokens": 18141510.0, "step": 3301, "train/ce_loss": 0.6210150718688965 }, { "epoch": 0.32637927625074153, "step": 3301, "train/sim_loss": 0.03125 }, { "epoch": 0.32637927625074153, "step": 3301, "train/total_loss": 0.09335151314735413 }, { "entropy": 8.70724105834961, "epoch": 0.3264781491002571, "mean_token_accuracy": 0.7838428020477295, "num_tokens": 18147118.0, "step": 3302, "train/ce_loss": 0.6362457871437073 }, { "epoch": 0.3264781491002571, "step": 3302, "train/sim_loss": 0.10546875 }, { "epoch": 0.3264781491002571, "step": 3302, "train/total_loss": 0.16909334063529968 }, { "entropy": 8.94888687133789, "epoch": 0.3265770219497726, "mean_token_accuracy": 0.7337142825126648, "num_tokens": 18152614.0, "step": 3303, "train/ce_loss": 0.8989349603652954 }, { "epoch": 0.3265770219497726, "step": 3303, "train/sim_loss": 0.0546875 }, { "epoch": 0.3265770219497726, "step": 3303, "train/total_loss": 0.14458099007606506 }, { "entropy": 8.655538558959961, "epoch": 0.3266758947992881, "mean_token_accuracy": 0.691033124923706, "num_tokens": 18158300.0, "step": 3304, "train/ce_loss": 1.126186728477478 }, { "epoch": 0.3266758947992881, "step": 3304, "train/sim_loss": 0.09375 }, { "epoch": 0.3266758947992881, "step": 3304, "train/total_loss": 0.20636868476867676 }, { "entropy": 9.213927268981934, "epoch": 0.32677476764880364, "mean_token_accuracy": 0.8050458431243896, "num_tokens": 18163728.0, "step": 3305, "train/ce_loss": 0.29916825890541077 }, { "epoch": 0.32677476764880364, "step": 3305, "train/sim_loss": 0.015625 }, { "epoch": 0.32677476764880364, "step": 3305, "train/total_loss": 0.045541826635599136 }, { "entropy": 8.833087921142578, "epoch": 0.3268736404983192, "mean_token_accuracy": 0.7292993664741516, "num_tokens": 18169300.0, "step": 3306, "train/ce_loss": 1.341254711151123 }, { "epoch": 0.3268736404983192, "step": 3306, "train/sim_loss": 0.0859375 }, { "epoch": 0.3268736404983192, "step": 3306, "train/total_loss": 0.2200629711151123 }, { "entropy": 8.849204063415527, "epoch": 0.32697251334783467, "mean_token_accuracy": 0.7310194969177246, "num_tokens": 18174839.0, "step": 3307, "train/ce_loss": 0.905493438243866 }, { "epoch": 0.32697251334783467, "step": 3307, "train/sim_loss": 0.109375 }, { "epoch": 0.32697251334783467, "step": 3307, "train/total_loss": 0.19992434978485107 }, { "entropy": 9.439088821411133, "epoch": 0.3270713861973502, "mean_token_accuracy": 0.7325728535652161, "num_tokens": 18180195.0, "step": 3308, "train/ce_loss": 0.942559003829956 }, { "epoch": 0.3270713861973502, "step": 3308, "train/sim_loss": 0.046875 }, { "epoch": 0.3270713861973502, "step": 3308, "train/total_loss": 0.14113089442253113 }, { "entropy": 9.08864688873291, "epoch": 0.32717025904686575, "mean_token_accuracy": 0.7572706937789917, "num_tokens": 18185753.0, "step": 3309, "train/ce_loss": 0.6623948812484741 }, { "epoch": 0.32717025904686575, "step": 3309, "train/sim_loss": 0.0625 }, { "epoch": 0.32717025904686575, "step": 3309, "train/total_loss": 0.12873949110507965 }, { "entropy": 8.727947235107422, "epoch": 0.32726913189638124, "mean_token_accuracy": 0.7682317495346069, "num_tokens": 18191403.0, "step": 3310, "train/ce_loss": 0.5689936280250549 }, { "epoch": 0.32726913189638124, "step": 3310, "train/sim_loss": 0.04296875 }, { "epoch": 0.32726913189638124, "step": 3310, "train/total_loss": 0.09986811876296997 }, { "entropy": 8.751195907592773, "epoch": 0.3273680047458968, "mean_token_accuracy": 0.7520891427993774, "num_tokens": 18197095.0, "step": 3311, "train/ce_loss": 1.4523690938949585 }, { "epoch": 0.3273680047458968, "step": 3311, "train/sim_loss": 0.0625 }, { "epoch": 0.3273680047458968, "step": 3311, "train/total_loss": 0.20773690938949585 }, { "entropy": 8.935558319091797, "epoch": 0.3274668775954123, "mean_token_accuracy": 0.7469478249549866, "num_tokens": 18202673.0, "step": 3312, "train/ce_loss": 0.644382894039154 }, { "epoch": 0.3274668775954123, "step": 3312, "train/sim_loss": 0.03125 }, { "epoch": 0.3274668775954123, "step": 3312, "train/total_loss": 0.09568829089403152 }, { "entropy": 9.24250602722168, "epoch": 0.3275657504449278, "mean_token_accuracy": 0.769444465637207, "num_tokens": 18207974.0, "step": 3313, "train/ce_loss": 0.8038400411605835 }, { "epoch": 0.3275657504449278, "step": 3313, "train/sim_loss": 0.05859375 }, { "epoch": 0.3275657504449278, "step": 3313, "train/total_loss": 0.1389777660369873 }, { "entropy": 9.025284767150879, "epoch": 0.32766462329444335, "mean_token_accuracy": 0.7398273944854736, "num_tokens": 18213390.0, "step": 3314, "train/ce_loss": 0.6775871515274048 }, { "epoch": 0.32766462329444335, "step": 3314, "train/sim_loss": 0.05078125 }, { "epoch": 0.32766462329444335, "step": 3314, "train/total_loss": 0.1185399666428566 }, { "entropy": 9.035704612731934, "epoch": 0.3277634961439589, "mean_token_accuracy": 0.7010443806648254, "num_tokens": 18218738.0, "step": 3315, "train/ce_loss": 0.8081430792808533 }, { "epoch": 0.3277634961439589, "step": 3315, "train/sim_loss": 0.04296875 }, { "epoch": 0.3277634961439589, "step": 3315, "train/total_loss": 0.12378305941820145 }, { "entropy": 9.188863754272461, "epoch": 0.3278623689934744, "mean_token_accuracy": 0.7696139216423035, "num_tokens": 18224118.0, "step": 3316, "train/ce_loss": 0.9755063652992249 }, { "epoch": 0.3278623689934744, "step": 3316, "train/sim_loss": 0.1484375 }, { "epoch": 0.3278623689934744, "step": 3316, "train/total_loss": 0.245988130569458 }, { "entropy": 8.440709114074707, "epoch": 0.3279612418429899, "mean_token_accuracy": 0.7188888788223267, "num_tokens": 18229730.0, "step": 3317, "train/ce_loss": 1.5816798210144043 }, { "epoch": 0.3279612418429899, "step": 3317, "train/sim_loss": 0.12890625 }, { "epoch": 0.3279612418429899, "step": 3317, "train/total_loss": 0.2870742380619049 }, { "entropy": 8.91847038269043, "epoch": 0.32806011469250546, "mean_token_accuracy": 0.7640449404716492, "num_tokens": 18235275.0, "step": 3318, "train/ce_loss": 0.8242001533508301 }, { "epoch": 0.32806011469250546, "step": 3318, "train/sim_loss": 0.09375 }, { "epoch": 0.32806011469250546, "step": 3318, "train/total_loss": 0.17617002129554749 }, { "entropy": 8.862126350402832, "epoch": 0.32815898754202094, "mean_token_accuracy": 0.711416482925415, "num_tokens": 18240874.0, "step": 3319, "train/ce_loss": 0.3245711326599121 }, { "epoch": 0.32815898754202094, "step": 3319, "train/sim_loss": 0.05859375 }, { "epoch": 0.32815898754202094, "step": 3319, "train/total_loss": 0.09105086326599121 }, { "epoch": 0.3282578603915365, "grad_norm": 0.7884343266487122, "learning_rate": 9.18187212579736e-06, "loss": 0.1511, "step": 3320 }, { "entropy": 9.164953231811523, "epoch": 0.3282578603915365, "mean_token_accuracy": 0.7247956395149231, "num_tokens": 18246128.0, "step": 3320, "train/ce_loss": 1.2741084098815918 }, { "epoch": 0.3282578603915365, "step": 3320, "train/sim_loss": 0.05859375 }, { "epoch": 0.3282578603915365, "step": 3320, "train/total_loss": 0.18600459396839142 }, { "entropy": 9.216476440429688, "epoch": 0.328356733241052, "mean_token_accuracy": 0.7596513032913208, "num_tokens": 18251534.0, "step": 3321, "train/ce_loss": 0.9277222156524658 }, { "epoch": 0.328356733241052, "step": 3321, "train/sim_loss": 0.06640625 }, { "epoch": 0.328356733241052, "step": 3321, "train/total_loss": 0.1591784656047821 }, { "entropy": 8.632231712341309, "epoch": 0.3284556060905675, "mean_token_accuracy": 0.7368420958518982, "num_tokens": 18257390.0, "step": 3322, "train/ce_loss": 0.32313722372055054 }, { "epoch": 0.3284556060905675, "step": 3322, "train/sim_loss": 0.0546875 }, { "epoch": 0.3284556060905675, "step": 3322, "train/total_loss": 0.08700121939182281 }, { "entropy": 8.837331771850586, "epoch": 0.32855447894008305, "mean_token_accuracy": 0.7614436745643616, "num_tokens": 18263107.0, "step": 3323, "train/ce_loss": 0.2447875589132309 }, { "epoch": 0.32855447894008305, "step": 3323, "train/sim_loss": 0.0234375 }, { "epoch": 0.32855447894008305, "step": 3323, "train/total_loss": 0.04791625589132309 }, { "entropy": 9.103507041931152, "epoch": 0.3286533517895986, "mean_token_accuracy": 0.756313145160675, "num_tokens": 18268571.0, "step": 3324, "train/ce_loss": 1.02525794506073 }, { "epoch": 0.3286533517895986, "step": 3324, "train/sim_loss": 0.0546875 }, { "epoch": 0.3286533517895986, "step": 3324, "train/total_loss": 0.15721330046653748 }, { "entropy": 8.538156509399414, "epoch": 0.3287522246391141, "mean_token_accuracy": 0.7201257944107056, "num_tokens": 18274396.0, "step": 3325, "train/ce_loss": 1.638105869293213 }, { "epoch": 0.3287522246391141, "step": 3325, "train/sim_loss": 0.0546875 }, { "epoch": 0.3287522246391141, "step": 3325, "train/total_loss": 0.218498095870018 }, { "entropy": 9.299535751342773, "epoch": 0.3288510974886296, "mean_token_accuracy": 0.7316784858703613, "num_tokens": 18279847.0, "step": 3326, "train/ce_loss": 0.9666756987571716 }, { "epoch": 0.3288510974886296, "step": 3326, "train/sim_loss": 0.06640625 }, { "epoch": 0.3288510974886296, "step": 3326, "train/total_loss": 0.1630738228559494 }, { "entropy": 8.695920944213867, "epoch": 0.32894997033814516, "mean_token_accuracy": 0.7157569527626038, "num_tokens": 18285408.0, "step": 3327, "train/ce_loss": 0.7027592062950134 }, { "epoch": 0.32894997033814516, "step": 3327, "train/sim_loss": 0.0546875 }, { "epoch": 0.32894997033814516, "step": 3327, "train/total_loss": 0.1249634250998497 }, { "entropy": 9.011902809143066, "epoch": 0.32904884318766064, "mean_token_accuracy": 0.844953179359436, "num_tokens": 18291000.0, "step": 3328, "train/ce_loss": 0.6668858528137207 }, { "epoch": 0.32904884318766064, "step": 3328, "train/sim_loss": 0.08984375 }, { "epoch": 0.32904884318766064, "step": 3328, "train/total_loss": 0.15653234720230103 }, { "entropy": 8.9949369430542, "epoch": 0.3291477160371762, "mean_token_accuracy": 0.7248018383979797, "num_tokens": 18296472.0, "step": 3329, "train/ce_loss": 0.6475987434387207 }, { "epoch": 0.3291477160371762, "step": 3329, "train/sim_loss": 0.05078125 }, { "epoch": 0.3291477160371762, "step": 3329, "train/total_loss": 0.11554112285375595 }, { "entropy": 8.579181671142578, "epoch": 0.3292465888866917, "mean_token_accuracy": 0.741525411605835, "num_tokens": 18302252.0, "step": 3330, "train/ce_loss": 1.3388166427612305 }, { "epoch": 0.3292465888866917, "step": 3330, "train/sim_loss": 0.05078125 }, { "epoch": 0.3292465888866917, "step": 3330, "train/total_loss": 0.18466292321681976 }, { "entropy": 8.847908020019531, "epoch": 0.3293454617362072, "mean_token_accuracy": 0.7729566097259521, "num_tokens": 18307890.0, "step": 3331, "train/ce_loss": 0.8516255617141724 }, { "epoch": 0.3293454617362072, "step": 3331, "train/sim_loss": 0.078125 }, { "epoch": 0.3293454617362072, "step": 3331, "train/total_loss": 0.16328755021095276 }, { "entropy": 9.016305923461914, "epoch": 0.32944433458572275, "mean_token_accuracy": 0.7805642485618591, "num_tokens": 18313529.0, "step": 3332, "train/ce_loss": 0.5196484327316284 }, { "epoch": 0.32944433458572275, "step": 3332, "train/sim_loss": 0.0703125 }, { "epoch": 0.32944433458572275, "step": 3332, "train/total_loss": 0.12227734923362732 }, { "entropy": 8.981419563293457, "epoch": 0.3295432074352383, "mean_token_accuracy": 0.7289940714836121, "num_tokens": 18318949.0, "step": 3333, "train/ce_loss": 0.6052286624908447 }, { "epoch": 0.3295432074352383, "step": 3333, "train/sim_loss": 0.04296875 }, { "epoch": 0.3295432074352383, "step": 3333, "train/total_loss": 0.10349161922931671 }, { "entropy": 9.238487243652344, "epoch": 0.3296420802847538, "mean_token_accuracy": 0.7923076748847961, "num_tokens": 18324317.0, "step": 3334, "train/ce_loss": 0.5796465277671814 }, { "epoch": 0.3296420802847538, "step": 3334, "train/sim_loss": 0.0390625 }, { "epoch": 0.3296420802847538, "step": 3334, "train/total_loss": 0.09702715277671814 }, { "entropy": 9.094803810119629, "epoch": 0.3297409531342693, "mean_token_accuracy": 0.7376294732093811, "num_tokens": 18329860.0, "step": 3335, "train/ce_loss": 0.6193069219589233 }, { "epoch": 0.3297409531342693, "step": 3335, "train/sim_loss": 0.07421875 }, { "epoch": 0.3297409531342693, "step": 3335, "train/total_loss": 0.13614943623542786 }, { "entropy": 9.007802963256836, "epoch": 0.32983982598378486, "mean_token_accuracy": 0.7424242496490479, "num_tokens": 18335356.0, "step": 3336, "train/ce_loss": 0.8635463714599609 }, { "epoch": 0.32983982598378486, "step": 3336, "train/sim_loss": 0.0390625 }, { "epoch": 0.32983982598378486, "step": 3336, "train/total_loss": 0.12541714310646057 }, { "entropy": 8.665424346923828, "epoch": 0.3299386988333004, "mean_token_accuracy": 0.7522830963134766, "num_tokens": 18341015.0, "step": 3337, "train/ce_loss": 0.8347327709197998 }, { "epoch": 0.3299386988333004, "step": 3337, "train/sim_loss": 0.0625 }, { "epoch": 0.3299386988333004, "step": 3337, "train/total_loss": 0.14597328007221222 }, { "entropy": 8.819108009338379, "epoch": 0.3300375716828159, "mean_token_accuracy": 0.7445708513259888, "num_tokens": 18346581.0, "step": 3338, "train/ce_loss": 0.2879335284233093 }, { "epoch": 0.3300375716828159, "step": 3338, "train/sim_loss": 0.0234375 }, { "epoch": 0.3300375716828159, "step": 3338, "train/total_loss": 0.05223085358738899 }, { "entropy": 8.792055130004883, "epoch": 0.33013644453233143, "mean_token_accuracy": 0.7660642862319946, "num_tokens": 18352244.0, "step": 3339, "train/ce_loss": 0.46801137924194336 }, { "epoch": 0.33013644453233143, "step": 3339, "train/sim_loss": 0.0390625 }, { "epoch": 0.33013644453233143, "step": 3339, "train/total_loss": 0.0858636349439621 }, { "epoch": 0.33023531738184697, "grad_norm": 0.7758581042289734, "learning_rate": 9.17692726103941e-06, "loss": 0.1371, "step": 3340 }, { "entropy": 8.919683456420898, "epoch": 0.33023531738184697, "mean_token_accuracy": 0.7669584155082703, "num_tokens": 18357803.0, "step": 3340, "train/ce_loss": 0.7381523251533508 }, { "epoch": 0.33023531738184697, "step": 3340, "train/sim_loss": 0.08203125 }, { "epoch": 0.33023531738184697, "step": 3340, "train/total_loss": 0.1558464765548706 }, { "entropy": 8.718966484069824, "epoch": 0.33033419023136246, "mean_token_accuracy": 0.7060439586639404, "num_tokens": 18363508.0, "step": 3341, "train/ce_loss": 0.5067249536514282 }, { "epoch": 0.33033419023136246, "step": 3341, "train/sim_loss": 0.078125 }, { "epoch": 0.33033419023136246, "step": 3341, "train/total_loss": 0.1287975013256073 }, { "entropy": 9.097214698791504, "epoch": 0.330433063080878, "mean_token_accuracy": 0.8537682890892029, "num_tokens": 18368952.0, "step": 3342, "train/ce_loss": 0.3833322525024414 }, { "epoch": 0.330433063080878, "step": 3342, "train/sim_loss": 0.0234375 }, { "epoch": 0.330433063080878, "step": 3342, "train/total_loss": 0.0617707259953022 }, { "entropy": 8.80815601348877, "epoch": 0.33053193593039354, "mean_token_accuracy": 0.7436762452125549, "num_tokens": 18374672.0, "step": 3343, "train/ce_loss": 1.6839814186096191 }, { "epoch": 0.33053193593039354, "step": 3343, "train/sim_loss": 0.0625 }, { "epoch": 0.33053193593039354, "step": 3343, "train/total_loss": 0.23089814186096191 }, { "entropy": 9.405206680297852, "epoch": 0.330630808779909, "mean_token_accuracy": 0.7774607539176941, "num_tokens": 18380006.0, "step": 3344, "train/ce_loss": 0.5376268029212952 }, { "epoch": 0.330630808779909, "step": 3344, "train/sim_loss": 0.015625 }, { "epoch": 0.330630808779909, "step": 3344, "train/total_loss": 0.06938768178224564 }, { "entropy": 9.093528747558594, "epoch": 0.33072968162942457, "mean_token_accuracy": 0.7627118825912476, "num_tokens": 18385543.0, "step": 3345, "train/ce_loss": 0.8665975332260132 }, { "epoch": 0.33072968162942457, "step": 3345, "train/sim_loss": 0.0703125 }, { "epoch": 0.33072968162942457, "step": 3345, "train/total_loss": 0.1569722592830658 }, { "entropy": 9.10706901550293, "epoch": 0.3308285544789401, "mean_token_accuracy": 0.663484513759613, "num_tokens": 18390871.0, "step": 3346, "train/ce_loss": 1.4762009382247925 }, { "epoch": 0.3308285544789401, "step": 3346, "train/sim_loss": 0.1171875 }, { "epoch": 0.3308285544789401, "step": 3346, "train/total_loss": 0.2648075819015503 }, { "entropy": 9.04255485534668, "epoch": 0.3309274273284556, "mean_token_accuracy": 0.7052752375602722, "num_tokens": 18396356.0, "step": 3347, "train/ce_loss": 0.9097988605499268 }, { "epoch": 0.3309274273284556, "step": 3347, "train/sim_loss": 0.0859375 }, { "epoch": 0.3309274273284556, "step": 3347, "train/total_loss": 0.17691738903522491 }, { "entropy": 8.859208106994629, "epoch": 0.33102630017797113, "mean_token_accuracy": 0.7261146306991577, "num_tokens": 18401919.0, "step": 3348, "train/ce_loss": 1.1240581274032593 }, { "epoch": 0.33102630017797113, "step": 3348, "train/sim_loss": 0.08203125 }, { "epoch": 0.33102630017797113, "step": 3348, "train/total_loss": 0.19443705677986145 }, { "entropy": 8.62106704711914, "epoch": 0.3311251730274867, "mean_token_accuracy": 0.744870662689209, "num_tokens": 18407613.0, "step": 3349, "train/ce_loss": 2.23869252204895 }, { "epoch": 0.3311251730274867, "step": 3349, "train/sim_loss": 0.05078125 }, { "epoch": 0.3311251730274867, "step": 3349, "train/total_loss": 0.274650514125824 }, { "entropy": 8.657695770263672, "epoch": 0.33122404587700216, "mean_token_accuracy": 0.7579618096351624, "num_tokens": 18413219.0, "step": 3350, "train/ce_loss": 0.6351818442344666 }, { "epoch": 0.33122404587700216, "step": 3350, "train/sim_loss": 0.0625 }, { "epoch": 0.33122404587700216, "step": 3350, "train/total_loss": 0.1260181963443756 }, { "entropy": 8.343402862548828, "epoch": 0.3313229187265177, "mean_token_accuracy": 0.7057554125785828, "num_tokens": 18419209.0, "step": 3351, "train/ce_loss": 0.6810896992683411 }, { "epoch": 0.3313229187265177, "step": 3351, "train/sim_loss": 0.1953125 }, { "epoch": 0.3313229187265177, "step": 3351, "train/total_loss": 0.2634214758872986 }, { "entropy": 8.982009887695312, "epoch": 0.33142179157603324, "mean_token_accuracy": 0.7820512652397156, "num_tokens": 18424774.0, "step": 3352, "train/ce_loss": 0.6501287221908569 }, { "epoch": 0.33142179157603324, "step": 3352, "train/sim_loss": 0.0703125 }, { "epoch": 0.33142179157603324, "step": 3352, "train/total_loss": 0.1353253722190857 }, { "entropy": 9.350072860717773, "epoch": 0.33152066442554873, "mean_token_accuracy": 0.7331536412239075, "num_tokens": 18430136.0, "step": 3353, "train/ce_loss": 0.8989262580871582 }, { "epoch": 0.33152066442554873, "step": 3353, "train/sim_loss": 0.046875 }, { "epoch": 0.33152066442554873, "step": 3353, "train/total_loss": 0.13676762580871582 }, { "entropy": 9.000933647155762, "epoch": 0.33161953727506427, "mean_token_accuracy": 0.7591623067855835, "num_tokens": 18435632.0, "step": 3354, "train/ce_loss": 0.6267368793487549 }, { "epoch": 0.33161953727506427, "step": 3354, "train/sim_loss": 0.0234375 }, { "epoch": 0.33161953727506427, "step": 3354, "train/total_loss": 0.08611118793487549 }, { "entropy": 8.909933090209961, "epoch": 0.3317184101245798, "mean_token_accuracy": 0.7263843417167664, "num_tokens": 18441194.0, "step": 3355, "train/ce_loss": 0.8958181142807007 }, { "epoch": 0.3317184101245798, "step": 3355, "train/sim_loss": 0.0625 }, { "epoch": 0.3317184101245798, "step": 3355, "train/total_loss": 0.15208181738853455 }, { "entropy": 9.095330238342285, "epoch": 0.3318172829740953, "mean_token_accuracy": 0.6760563254356384, "num_tokens": 18446672.0, "step": 3356, "train/ce_loss": 1.2084461450576782 }, { "epoch": 0.3318172829740953, "step": 3356, "train/sim_loss": 0.11328125 }, { "epoch": 0.3318172829740953, "step": 3356, "train/total_loss": 0.23412586748600006 }, { "entropy": 8.93388557434082, "epoch": 0.33191615582361084, "mean_token_accuracy": 0.7923322916030884, "num_tokens": 18452173.0, "step": 3357, "train/ce_loss": 0.4659091532230377 }, { "epoch": 0.33191615582361084, "step": 3357, "train/sim_loss": 0.0234375 }, { "epoch": 0.33191615582361084, "step": 3357, "train/total_loss": 0.07002841681241989 }, { "entropy": 8.665193557739258, "epoch": 0.3320150286731264, "mean_token_accuracy": 0.7616438269615173, "num_tokens": 18457944.0, "step": 3358, "train/ce_loss": 0.8183895945549011 }, { "epoch": 0.3320150286731264, "step": 3358, "train/sim_loss": 0.0546875 }, { "epoch": 0.3320150286731264, "step": 3358, "train/total_loss": 0.1365264654159546 }, { "entropy": 8.497122764587402, "epoch": 0.33211390152264186, "mean_token_accuracy": 0.7773475050926208, "num_tokens": 18463669.0, "step": 3359, "train/ce_loss": 0.41876593232154846 }, { "epoch": 0.33211390152264186, "step": 3359, "train/sim_loss": 0.0703125 }, { "epoch": 0.33211390152264186, "step": 3359, "train/total_loss": 0.11218909919261932 }, { "epoch": 0.3322127743721574, "grad_norm": 0.7501981854438782, "learning_rate": 9.171982396281463e-06, "loss": 0.1444, "step": 3360 }, { "entropy": 9.059199333190918, "epoch": 0.3322127743721574, "mean_token_accuracy": 0.739294707775116, "num_tokens": 18469047.0, "step": 3360, "train/ce_loss": 1.2881019115447998 }, { "epoch": 0.3322127743721574, "step": 3360, "train/sim_loss": 0.0546875 }, { "epoch": 0.3322127743721574, "step": 3360, "train/total_loss": 0.18349769711494446 }, { "entropy": 9.23812484741211, "epoch": 0.33231164722167295, "mean_token_accuracy": 0.7698519229888916, "num_tokens": 18474358.0, "step": 3361, "train/ce_loss": 1.2250069379806519 }, { "epoch": 0.33231164722167295, "step": 3361, "train/sim_loss": 0.05078125 }, { "epoch": 0.33231164722167295, "step": 3361, "train/total_loss": 0.1732819378376007 }, { "entropy": 9.14293384552002, "epoch": 0.33241052007118843, "mean_token_accuracy": 0.7312775254249573, "num_tokens": 18479651.0, "step": 3362, "train/ce_loss": 0.7257992625236511 }, { "epoch": 0.33241052007118843, "step": 3362, "train/sim_loss": 0.02734375 }, { "epoch": 0.33241052007118843, "step": 3362, "train/total_loss": 0.09992367774248123 }, { "entropy": 9.163899421691895, "epoch": 0.33250939292070397, "mean_token_accuracy": 0.7641975283622742, "num_tokens": 18485041.0, "step": 3363, "train/ce_loss": 0.6538408398628235 }, { "epoch": 0.33250939292070397, "step": 3363, "train/sim_loss": 0.01953125 }, { "epoch": 0.33250939292070397, "step": 3363, "train/total_loss": 0.08491533249616623 }, { "entropy": 9.087060928344727, "epoch": 0.3326082657702195, "mean_token_accuracy": 0.7549669146537781, "num_tokens": 18490597.0, "step": 3364, "train/ce_loss": 0.7900916337966919 }, { "epoch": 0.3326082657702195, "step": 3364, "train/sim_loss": 0.06640625 }, { "epoch": 0.3326082657702195, "step": 3364, "train/total_loss": 0.14541542530059814 }, { "entropy": 9.108280181884766, "epoch": 0.332707138619735, "mean_token_accuracy": 0.7584951519966125, "num_tokens": 18496043.0, "step": 3365, "train/ce_loss": 0.4278091788291931 }, { "epoch": 0.332707138619735, "step": 3365, "train/sim_loss": 0.0234375 }, { "epoch": 0.332707138619735, "step": 3365, "train/total_loss": 0.06621842086315155 }, { "entropy": 9.352507591247559, "epoch": 0.33280601146925054, "mean_token_accuracy": 0.7618403434753418, "num_tokens": 18501350.0, "step": 3366, "train/ce_loss": 0.5851004123687744 }, { "epoch": 0.33280601146925054, "step": 3366, "train/sim_loss": 0.0234375 }, { "epoch": 0.33280601146925054, "step": 3366, "train/total_loss": 0.08194754272699356 }, { "entropy": 8.591299057006836, "epoch": 0.3329048843187661, "mean_token_accuracy": 0.7633851170539856, "num_tokens": 18507157.0, "step": 3367, "train/ce_loss": 0.6820550560951233 }, { "epoch": 0.3329048843187661, "step": 3367, "train/sim_loss": 0.0234375 }, { "epoch": 0.3329048843187661, "step": 3367, "train/total_loss": 0.09164300560951233 }, { "entropy": 9.086897850036621, "epoch": 0.33300375716828157, "mean_token_accuracy": 0.7763158082962036, "num_tokens": 18512530.0, "step": 3368, "train/ce_loss": 0.9694409966468811 }, { "epoch": 0.33300375716828157, "step": 3368, "train/sim_loss": 0.0625 }, { "epoch": 0.33300375716828157, "step": 3368, "train/total_loss": 0.15944409370422363 }, { "entropy": 8.80565071105957, "epoch": 0.3331026300177971, "mean_token_accuracy": 0.76450115442276, "num_tokens": 18518021.0, "step": 3369, "train/ce_loss": 1.0303025245666504 }, { "epoch": 0.3331026300177971, "step": 3369, "train/sim_loss": 0.09375 }, { "epoch": 0.3331026300177971, "step": 3369, "train/total_loss": 0.196780264377594 }, { "entropy": 9.322933197021484, "epoch": 0.33320150286731265, "mean_token_accuracy": 0.780802309513092, "num_tokens": 18523241.0, "step": 3370, "train/ce_loss": 0.8955622315406799 }, { "epoch": 0.33320150286731265, "step": 3370, "train/sim_loss": 0.0859375 }, { "epoch": 0.33320150286731265, "step": 3370, "train/total_loss": 0.17549371719360352 }, { "entropy": 9.332192420959473, "epoch": 0.33330037571682813, "mean_token_accuracy": 0.7747489213943481, "num_tokens": 18528504.0, "step": 3371, "train/ce_loss": 0.8732729554176331 }, { "epoch": 0.33330037571682813, "step": 3371, "train/sim_loss": 0.046875 }, { "epoch": 0.33330037571682813, "step": 3371, "train/total_loss": 0.13420230150222778 }, { "entropy": 9.071432113647461, "epoch": 0.3333992485663437, "mean_token_accuracy": 0.7429245114326477, "num_tokens": 18533910.0, "step": 3372, "train/ce_loss": 1.647017478942871 }, { "epoch": 0.3333992485663437, "step": 3372, "train/sim_loss": 0.09375 }, { "epoch": 0.3333992485663437, "step": 3372, "train/total_loss": 0.25845175981521606 }, { "entropy": 9.453814506530762, "epoch": 0.3334981214158592, "mean_token_accuracy": 0.7267950773239136, "num_tokens": 18539089.0, "step": 3373, "train/ce_loss": 1.3078104257583618 }, { "epoch": 0.3334981214158592, "step": 3373, "train/sim_loss": 0.0390625 }, { "epoch": 0.3334981214158592, "step": 3373, "train/total_loss": 0.16984353959560394 }, { "entropy": 8.746170043945312, "epoch": 0.3335969942653747, "mean_token_accuracy": 0.7574421167373657, "num_tokens": 18544611.0, "step": 3374, "train/ce_loss": 0.6829934120178223 }, { "epoch": 0.3335969942653747, "step": 3374, "train/sim_loss": 0.046875 }, { "epoch": 0.3335969942653747, "step": 3374, "train/total_loss": 0.11517434567213058 }, { "entropy": 8.795426368713379, "epoch": 0.33369586711489024, "mean_token_accuracy": 0.7514231204986572, "num_tokens": 18550319.0, "step": 3375, "train/ce_loss": 0.9700766205787659 }, { "epoch": 0.33369586711489024, "step": 3375, "train/sim_loss": 0.0703125 }, { "epoch": 0.33369586711489024, "step": 3375, "train/total_loss": 0.1673201620578766 }, { "entropy": 8.689640045166016, "epoch": 0.3337947399644058, "mean_token_accuracy": 0.7888655662536621, "num_tokens": 18555875.0, "step": 3376, "train/ce_loss": 0.4641062617301941 }, { "epoch": 0.3337947399644058, "step": 3376, "train/sim_loss": 0.06640625 }, { "epoch": 0.3337947399644058, "step": 3376, "train/total_loss": 0.11281687766313553 }, { "entropy": 8.95899486541748, "epoch": 0.33389361281392127, "mean_token_accuracy": 0.7827076315879822, "num_tokens": 18561396.0, "step": 3377, "train/ce_loss": 0.5526520609855652 }, { "epoch": 0.33389361281392127, "step": 3377, "train/sim_loss": 0.0703125 }, { "epoch": 0.33389361281392127, "step": 3377, "train/total_loss": 0.12557770311832428 }, { "entropy": 9.269828796386719, "epoch": 0.3339924856634368, "mean_token_accuracy": 0.7818740606307983, "num_tokens": 18566616.0, "step": 3378, "train/ce_loss": 0.656091570854187 }, { "epoch": 0.3339924856634368, "step": 3378, "train/sim_loss": 0.046875 }, { "epoch": 0.3339924856634368, "step": 3378, "train/total_loss": 0.1124841570854187 }, { "entropy": 9.07741641998291, "epoch": 0.33409135851295235, "mean_token_accuracy": 0.7734806537628174, "num_tokens": 18572094.0, "step": 3379, "train/ce_loss": 0.9331567287445068 }, { "epoch": 0.33409135851295235, "step": 3379, "train/sim_loss": 0.0625 }, { "epoch": 0.33409135851295235, "step": 3379, "train/total_loss": 0.15581567585468292 }, { "epoch": 0.3341902313624679, "grad_norm": 0.706942081451416, "learning_rate": 9.167037531523513e-06, "loss": 0.1311, "step": 3380 }, { "entropy": 9.050409317016602, "epoch": 0.3341902313624679, "mean_token_accuracy": 0.7508690357208252, "num_tokens": 18577568.0, "step": 3380, "train/ce_loss": 0.5850479006767273 }, { "epoch": 0.3341902313624679, "step": 3380, "train/sim_loss": 0.03125 }, { "epoch": 0.3341902313624679, "step": 3380, "train/total_loss": 0.08975479006767273 }, { "entropy": 8.769598960876465, "epoch": 0.3342891042119834, "mean_token_accuracy": 0.720108687877655, "num_tokens": 18583209.0, "step": 3381, "train/ce_loss": 0.7118861675262451 }, { "epoch": 0.3342891042119834, "step": 3381, "train/sim_loss": 0.046875 }, { "epoch": 0.3342891042119834, "step": 3381, "train/total_loss": 0.11806362122297287 }, { "entropy": 9.195652961730957, "epoch": 0.3343879770614989, "mean_token_accuracy": 0.7486486434936523, "num_tokens": 18588577.0, "step": 3382, "train/ce_loss": 0.4577292799949646 }, { "epoch": 0.3343879770614989, "step": 3382, "train/sim_loss": 0.046875 }, { "epoch": 0.3343879770614989, "step": 3382, "train/total_loss": 0.09264792501926422 }, { "entropy": 9.1483736038208, "epoch": 0.33448684991101446, "mean_token_accuracy": 0.7923362255096436, "num_tokens": 18593961.0, "step": 3383, "train/ce_loss": 0.5180718898773193 }, { "epoch": 0.33448684991101446, "step": 3383, "train/sim_loss": 0.06640625 }, { "epoch": 0.33448684991101446, "step": 3383, "train/total_loss": 0.11821344494819641 }, { "entropy": 8.967496871948242, "epoch": 0.33458572276052995, "mean_token_accuracy": 0.8033826351165771, "num_tokens": 18599438.0, "step": 3384, "train/ce_loss": 0.6654918789863586 }, { "epoch": 0.33458572276052995, "step": 3384, "train/sim_loss": 0.046875 }, { "epoch": 0.33458572276052995, "step": 3384, "train/total_loss": 0.11342418938875198 }, { "entropy": 8.828765869140625, "epoch": 0.3346845956100455, "mean_token_accuracy": 0.8460744023323059, "num_tokens": 18605094.0, "step": 3385, "train/ce_loss": 0.5964673161506653 }, { "epoch": 0.3346845956100455, "step": 3385, "train/sim_loss": 0.02734375 }, { "epoch": 0.3346845956100455, "step": 3385, "train/total_loss": 0.08699048310518265 }, { "entropy": 8.98710823059082, "epoch": 0.33478346845956103, "mean_token_accuracy": 0.704402506351471, "num_tokens": 18610611.0, "step": 3386, "train/ce_loss": 0.9654845595359802 }, { "epoch": 0.33478346845956103, "step": 3386, "train/sim_loss": 0.1015625 }, { "epoch": 0.33478346845956103, "step": 3386, "train/total_loss": 0.19811096787452698 }, { "entropy": 8.86330509185791, "epoch": 0.3348823413090765, "mean_token_accuracy": 0.7366120219230652, "num_tokens": 18616062.0, "step": 3387, "train/ce_loss": 0.6297827363014221 }, { "epoch": 0.3348823413090765, "step": 3387, "train/sim_loss": 0.05078125 }, { "epoch": 0.3348823413090765, "step": 3387, "train/total_loss": 0.11375952512025833 }, { "entropy": 9.094345092773438, "epoch": 0.33498121415859206, "mean_token_accuracy": 0.703529417514801, "num_tokens": 18621543.0, "step": 3388, "train/ce_loss": 0.730383574962616 }, { "epoch": 0.33498121415859206, "step": 3388, "train/sim_loss": 0.08203125 }, { "epoch": 0.33498121415859206, "step": 3388, "train/total_loss": 0.15506961941719055 }, { "entropy": 9.091678619384766, "epoch": 0.3350800870081076, "mean_token_accuracy": 0.7801339030265808, "num_tokens": 18627078.0, "step": 3389, "train/ce_loss": 0.4360784590244293 }, { "epoch": 0.3350800870081076, "step": 3389, "train/sim_loss": 0.0234375 }, { "epoch": 0.3350800870081076, "step": 3389, "train/total_loss": 0.06704534590244293 }, { "entropy": 9.072471618652344, "epoch": 0.3351789598576231, "mean_token_accuracy": 0.7403846383094788, "num_tokens": 18632571.0, "step": 3390, "train/ce_loss": 0.6340453624725342 }, { "epoch": 0.3351789598576231, "step": 3390, "train/sim_loss": 0.1015625 }, { "epoch": 0.3351789598576231, "step": 3390, "train/total_loss": 0.16496703028678894 }, { "entropy": 9.282722473144531, "epoch": 0.3352778327071386, "mean_token_accuracy": 0.6961178183555603, "num_tokens": 18637970.0, "step": 3391, "train/ce_loss": 0.9272945523262024 }, { "epoch": 0.3352778327071386, "step": 3391, "train/sim_loss": 0.1328125 }, { "epoch": 0.3352778327071386, "step": 3391, "train/total_loss": 0.22554194927215576 }, { "entropy": 9.403739929199219, "epoch": 0.33537670555665416, "mean_token_accuracy": 0.7972789406776428, "num_tokens": 18643193.0, "step": 3392, "train/ce_loss": 1.0603013038635254 }, { "epoch": 0.33537670555665416, "step": 3392, "train/sim_loss": 0.0546875 }, { "epoch": 0.33537670555665416, "step": 3392, "train/total_loss": 0.16071763634681702 }, { "entropy": 8.733884811401367, "epoch": 0.33547557840616965, "mean_token_accuracy": 0.7379077672958374, "num_tokens": 18648614.0, "step": 3393, "train/ce_loss": 0.5850088596343994 }, { "epoch": 0.33547557840616965, "step": 3393, "train/sim_loss": 0.07421875 }, { "epoch": 0.33547557840616965, "step": 3393, "train/total_loss": 0.13271963596343994 }, { "entropy": 8.784040451049805, "epoch": 0.3355744512556852, "mean_token_accuracy": 0.7703620195388794, "num_tokens": 18654150.0, "step": 3394, "train/ce_loss": 0.4400407373905182 }, { "epoch": 0.3355744512556852, "step": 3394, "train/sim_loss": 0.05859375 }, { "epoch": 0.3355744512556852, "step": 3394, "train/total_loss": 0.10259782522916794 }, { "entropy": 8.950641632080078, "epoch": 0.33567332410520073, "mean_token_accuracy": 0.7533186078071594, "num_tokens": 18659707.0, "step": 3395, "train/ce_loss": 0.7120389938354492 }, { "epoch": 0.33567332410520073, "step": 3395, "train/sim_loss": 0.05078125 }, { "epoch": 0.33567332410520073, "step": 3395, "train/total_loss": 0.12198515236377716 }, { "entropy": 8.847721099853516, "epoch": 0.3357721969547162, "mean_token_accuracy": 0.737463116645813, "num_tokens": 18665300.0, "step": 3396, "train/ce_loss": 1.3146852254867554 }, { "epoch": 0.3357721969547162, "step": 3396, "train/sim_loss": 0.0859375 }, { "epoch": 0.3357721969547162, "step": 3396, "train/total_loss": 0.2174060195684433 }, { "entropy": 9.018057823181152, "epoch": 0.33587106980423176, "mean_token_accuracy": 0.7657563090324402, "num_tokens": 18670825.0, "step": 3397, "train/ce_loss": 0.8091156482696533 }, { "epoch": 0.33587106980423176, "step": 3397, "train/sim_loss": 0.109375 }, { "epoch": 0.33587106980423176, "step": 3397, "train/total_loss": 0.1902865767478943 }, { "entropy": 9.297983169555664, "epoch": 0.3359699426537473, "mean_token_accuracy": 0.79651939868927, "num_tokens": 18676208.0, "step": 3398, "train/ce_loss": 0.4578360617160797 }, { "epoch": 0.3359699426537473, "step": 3398, "train/sim_loss": 0.03125 }, { "epoch": 0.3359699426537473, "step": 3398, "train/total_loss": 0.07703360915184021 }, { "entropy": 8.954853057861328, "epoch": 0.3360688155032628, "mean_token_accuracy": 0.767241358757019, "num_tokens": 18681787.0, "step": 3399, "train/ce_loss": 0.5627641081809998 }, { "epoch": 0.3360688155032628, "step": 3399, "train/sim_loss": 0.078125 }, { "epoch": 0.3360688155032628, "step": 3399, "train/total_loss": 0.13440141081809998 }, { "epoch": 0.3361676883527783, "grad_norm": 0.7006328701972961, "learning_rate": 9.162092666765566e-06, "loss": 0.1355, "step": 3400 }, { "entropy": 9.1798095703125, "epoch": 0.3361676883527783, "mean_token_accuracy": 0.7683073282241821, "num_tokens": 18687193.0, "step": 3400, "train/ce_loss": 0.7913243770599365 }, { "epoch": 0.3361676883527783, "step": 3400, "train/sim_loss": 0.05078125 }, { "epoch": 0.3361676883527783, "step": 3400, "train/total_loss": 0.12991368770599365 }, { "entropy": 9.226898193359375, "epoch": 0.33626656120229387, "mean_token_accuracy": 0.7276940941810608, "num_tokens": 18692625.0, "step": 3401, "train/ce_loss": 1.3303859233856201 }, { "epoch": 0.33626656120229387, "step": 3401, "train/sim_loss": 0.0859375 }, { "epoch": 0.33626656120229387, "step": 3401, "train/total_loss": 0.21897609531879425 }, { "entropy": 9.069173812866211, "epoch": 0.33636543405180935, "mean_token_accuracy": 0.7208706736564636, "num_tokens": 18698076.0, "step": 3402, "train/ce_loss": 1.3136374950408936 }, { "epoch": 0.33636543405180935, "step": 3402, "train/sim_loss": 0.06640625 }, { "epoch": 0.33636543405180935, "step": 3402, "train/total_loss": 0.19776999950408936 }, { "entropy": 9.214449882507324, "epoch": 0.3364643069013249, "mean_token_accuracy": 0.7424836754798889, "num_tokens": 18703515.0, "step": 3403, "train/ce_loss": 0.7999873757362366 }, { "epoch": 0.3364643069013249, "step": 3403, "train/sim_loss": 0.0703125 }, { "epoch": 0.3364643069013249, "step": 3403, "train/total_loss": 0.15031123161315918 }, { "entropy": 8.89826774597168, "epoch": 0.33656317975084044, "mean_token_accuracy": 0.7319474816322327, "num_tokens": 18709087.0, "step": 3404, "train/ce_loss": 0.9434422850608826 }, { "epoch": 0.33656317975084044, "step": 3404, "train/sim_loss": 0.11328125 }, { "epoch": 0.33656317975084044, "step": 3404, "train/total_loss": 0.20762547850608826 }, { "entropy": 9.141641616821289, "epoch": 0.3366620526003559, "mean_token_accuracy": 0.7348377704620361, "num_tokens": 18714352.0, "step": 3405, "train/ce_loss": 0.9298139810562134 }, { "epoch": 0.3366620526003559, "step": 3405, "train/sim_loss": 0.078125 }, { "epoch": 0.3366620526003559, "step": 3405, "train/total_loss": 0.17110639810562134 }, { "entropy": 9.130280494689941, "epoch": 0.33676092544987146, "mean_token_accuracy": 0.7022398114204407, "num_tokens": 18719743.0, "step": 3406, "train/ce_loss": 1.2450209856033325 }, { "epoch": 0.33676092544987146, "step": 3406, "train/sim_loss": 0.0546875 }, { "epoch": 0.33676092544987146, "step": 3406, "train/total_loss": 0.17918959259986877 }, { "entropy": 8.957133293151855, "epoch": 0.336859798299387, "mean_token_accuracy": 0.7918660044670105, "num_tokens": 18725155.0, "step": 3407, "train/ce_loss": 0.584973156452179 }, { "epoch": 0.336859798299387, "step": 3407, "train/sim_loss": 0.08984375 }, { "epoch": 0.336859798299387, "step": 3407, "train/total_loss": 0.14834105968475342 }, { "entropy": 9.071895599365234, "epoch": 0.3369586711489025, "mean_token_accuracy": 0.7995283007621765, "num_tokens": 18730660.0, "step": 3408, "train/ce_loss": 0.45891037583351135 }, { "epoch": 0.3369586711489025, "step": 3408, "train/sim_loss": 0.05859375 }, { "epoch": 0.3369586711489025, "step": 3408, "train/total_loss": 0.10448478907346725 }, { "entropy": 9.169526100158691, "epoch": 0.33705754399841803, "mean_token_accuracy": 0.774545431137085, "num_tokens": 18736084.0, "step": 3409, "train/ce_loss": 0.8945693969726562 }, { "epoch": 0.33705754399841803, "step": 3409, "train/sim_loss": 0.07421875 }, { "epoch": 0.33705754399841803, "step": 3409, "train/total_loss": 0.1636756956577301 }, { "entropy": 8.90278434753418, "epoch": 0.33715641684793357, "mean_token_accuracy": 0.699999988079071, "num_tokens": 18741588.0, "step": 3410, "train/ce_loss": 0.5722022652626038 }, { "epoch": 0.33715641684793357, "step": 3410, "train/sim_loss": 0.09375 }, { "epoch": 0.33715641684793357, "step": 3410, "train/total_loss": 0.1509702205657959 }, { "entropy": 8.910388946533203, "epoch": 0.33725528969744906, "mean_token_accuracy": 0.7255343198776245, "num_tokens": 18746982.0, "step": 3411, "train/ce_loss": 0.881068766117096 }, { "epoch": 0.33725528969744906, "step": 3411, "train/sim_loss": 0.1328125 }, { "epoch": 0.33725528969744906, "step": 3411, "train/total_loss": 0.22091937065124512 }, { "entropy": 9.38779067993164, "epoch": 0.3373541625469646, "mean_token_accuracy": 0.7993921041488647, "num_tokens": 18752299.0, "step": 3412, "train/ce_loss": 0.8114800453186035 }, { "epoch": 0.3373541625469646, "step": 3412, "train/sim_loss": 0.078125 }, { "epoch": 0.3373541625469646, "step": 3412, "train/total_loss": 0.15927299857139587 }, { "entropy": 8.949864387512207, "epoch": 0.33745303539648014, "mean_token_accuracy": 0.7337733507156372, "num_tokens": 18757825.0, "step": 3413, "train/ce_loss": 0.5011051893234253 }, { "epoch": 0.33745303539648014, "step": 3413, "train/sim_loss": 0.0234375 }, { "epoch": 0.33745303539648014, "step": 3413, "train/total_loss": 0.07354801893234253 }, { "entropy": 9.041603088378906, "epoch": 0.3375519082459956, "mean_token_accuracy": 0.7665036916732788, "num_tokens": 18763268.0, "step": 3414, "train/ce_loss": 0.9834603667259216 }, { "epoch": 0.3375519082459956, "step": 3414, "train/sim_loss": 0.1015625 }, { "epoch": 0.3375519082459956, "step": 3414, "train/total_loss": 0.1999085396528244 }, { "entropy": 8.851835250854492, "epoch": 0.33765078109551117, "mean_token_accuracy": 0.7308065891265869, "num_tokens": 18768977.0, "step": 3415, "train/ce_loss": 1.1188194751739502 }, { "epoch": 0.33765078109551117, "step": 3415, "train/sim_loss": 0.07421875 }, { "epoch": 0.33765078109551117, "step": 3415, "train/total_loss": 0.18610069155693054 }, { "entropy": 9.311737060546875, "epoch": 0.3377496539450267, "mean_token_accuracy": 0.7328145503997803, "num_tokens": 18774387.0, "step": 3416, "train/ce_loss": 1.4260293245315552 }, { "epoch": 0.3377496539450267, "step": 3416, "train/sim_loss": 0.1015625 }, { "epoch": 0.3377496539450267, "step": 3416, "train/total_loss": 0.24416543543338776 }, { "entropy": 8.866942405700684, "epoch": 0.3378485267945422, "mean_token_accuracy": 0.7476828098297119, "num_tokens": 18779948.0, "step": 3417, "train/ce_loss": 0.615195631980896 }, { "epoch": 0.3378485267945422, "step": 3417, "train/sim_loss": 0.0234375 }, { "epoch": 0.3378485267945422, "step": 3417, "train/total_loss": 0.0849570631980896 }, { "entropy": 9.18349838256836, "epoch": 0.33794739964405773, "mean_token_accuracy": 0.7805178761482239, "num_tokens": 18785381.0, "step": 3418, "train/ce_loss": 0.3667494058609009 }, { "epoch": 0.33794739964405773, "step": 3418, "train/sim_loss": 0.02734375 }, { "epoch": 0.33794739964405773, "step": 3418, "train/total_loss": 0.06401869654655457 }, { "entropy": 9.18016242980957, "epoch": 0.3380462724935733, "mean_token_accuracy": 0.7663421630859375, "num_tokens": 18790740.0, "step": 3419, "train/ce_loss": 0.7382562756538391 }, { "epoch": 0.3380462724935733, "step": 3419, "train/sim_loss": 0.05078125 }, { "epoch": 0.3380462724935733, "step": 3419, "train/total_loss": 0.12460687756538391 }, { "epoch": 0.3381451453430888, "grad_norm": 0.8937796354293823, "learning_rate": 9.157147802007616e-06, "loss": 0.1477, "step": 3420 }, { "entropy": 9.01446533203125, "epoch": 0.3381451453430888, "mean_token_accuracy": 0.7877813577651978, "num_tokens": 18796307.0, "step": 3420, "train/ce_loss": 0.28852227330207825 }, { "epoch": 0.3381451453430888, "step": 3420, "train/sim_loss": 0.06640625 }, { "epoch": 0.3381451453430888, "step": 3420, "train/total_loss": 0.09525847434997559 }, { "entropy": 8.929280281066895, "epoch": 0.3382440181926043, "mean_token_accuracy": 0.7640807628631592, "num_tokens": 18801924.0, "step": 3421, "train/ce_loss": 0.7963653802871704 }, { "epoch": 0.3382440181926043, "step": 3421, "train/sim_loss": 0.078125 }, { "epoch": 0.3382440181926043, "step": 3421, "train/total_loss": 0.15776154398918152 }, { "entropy": 9.04946517944336, "epoch": 0.33834289104211984, "mean_token_accuracy": 0.7488425970077515, "num_tokens": 18807394.0, "step": 3422, "train/ce_loss": 1.0314621925354004 }, { "epoch": 0.33834289104211984, "step": 3422, "train/sim_loss": 0.046875 }, { "epoch": 0.33834289104211984, "step": 3422, "train/total_loss": 0.15002122521400452 }, { "entropy": 9.30472183227539, "epoch": 0.3384417638916354, "mean_token_accuracy": 0.7438867688179016, "num_tokens": 18812770.0, "step": 3423, "train/ce_loss": 0.92303466796875 }, { "epoch": 0.3384417638916354, "step": 3423, "train/sim_loss": 0.0703125 }, { "epoch": 0.3384417638916354, "step": 3423, "train/total_loss": 0.16261596977710724 }, { "entropy": 9.082595825195312, "epoch": 0.33854063674115087, "mean_token_accuracy": 0.7819277048110962, "num_tokens": 18818250.0, "step": 3424, "train/ce_loss": 0.8292279243469238 }, { "epoch": 0.33854063674115087, "step": 3424, "train/sim_loss": 0.1015625 }, { "epoch": 0.33854063674115087, "step": 3424, "train/total_loss": 0.1844852864742279 }, { "entropy": 8.845739364624023, "epoch": 0.3386395095906664, "mean_token_accuracy": 0.7184557318687439, "num_tokens": 18823906.0, "step": 3425, "train/ce_loss": 0.6152939200401306 }, { "epoch": 0.3386395095906664, "step": 3425, "train/sim_loss": 0.08984375 }, { "epoch": 0.3386395095906664, "step": 3425, "train/total_loss": 0.15137314796447754 }, { "entropy": 8.903855323791504, "epoch": 0.33873838244018195, "mean_token_accuracy": 0.7274401187896729, "num_tokens": 18829563.0, "step": 3426, "train/ce_loss": 0.4410121738910675 }, { "epoch": 0.33873838244018195, "step": 3426, "train/sim_loss": 0.05078125 }, { "epoch": 0.33873838244018195, "step": 3426, "train/total_loss": 0.09488247334957123 }, { "entropy": 9.006560325622559, "epoch": 0.33883725528969744, "mean_token_accuracy": 0.7525539398193359, "num_tokens": 18835009.0, "step": 3427, "train/ce_loss": 0.3492516875267029 }, { "epoch": 0.33883725528969744, "step": 3427, "train/sim_loss": 0.03515625 }, { "epoch": 0.33883725528969744, "step": 3427, "train/total_loss": 0.07008142024278641 }, { "entropy": 9.309053421020508, "epoch": 0.338936128139213, "mean_token_accuracy": 0.7931488752365112, "num_tokens": 18840380.0, "step": 3428, "train/ce_loss": 0.5257266759872437 }, { "epoch": 0.338936128139213, "step": 3428, "train/sim_loss": 0.07421875 }, { "epoch": 0.338936128139213, "step": 3428, "train/total_loss": 0.12679141759872437 }, { "entropy": 8.769979476928711, "epoch": 0.3390350009887285, "mean_token_accuracy": 0.7220573425292969, "num_tokens": 18845976.0, "step": 3429, "train/ce_loss": 0.4966198801994324 }, { "epoch": 0.3390350009887285, "step": 3429, "train/sim_loss": 0.0390625 }, { "epoch": 0.3390350009887285, "step": 3429, "train/total_loss": 0.08872449398040771 }, { "entropy": 9.021951675415039, "epoch": 0.339133873838244, "mean_token_accuracy": 0.7599999904632568, "num_tokens": 18851451.0, "step": 3430, "train/ce_loss": 0.387717068195343 }, { "epoch": 0.339133873838244, "step": 3430, "train/sim_loss": 0.02734375 }, { "epoch": 0.339133873838244, "step": 3430, "train/total_loss": 0.06611545383930206 }, { "entropy": 9.009990692138672, "epoch": 0.33923274668775955, "mean_token_accuracy": 0.7054714560508728, "num_tokens": 18856875.0, "step": 3431, "train/ce_loss": 1.0437498092651367 }, { "epoch": 0.33923274668775955, "step": 3431, "train/sim_loss": 0.046875 }, { "epoch": 0.33923274668775955, "step": 3431, "train/total_loss": 0.1512499749660492 }, { "entropy": 9.200651168823242, "epoch": 0.3393316195372751, "mean_token_accuracy": 0.722841203212738, "num_tokens": 18862233.0, "step": 3432, "train/ce_loss": 0.8129413723945618 }, { "epoch": 0.3393316195372751, "step": 3432, "train/sim_loss": 0.08203125 }, { "epoch": 0.3393316195372751, "step": 3432, "train/total_loss": 0.16332539916038513 }, { "entropy": 8.964715003967285, "epoch": 0.33943049238679057, "mean_token_accuracy": 0.6902760863304138, "num_tokens": 18867730.0, "step": 3433, "train/ce_loss": 0.853792130947113 }, { "epoch": 0.33943049238679057, "step": 3433, "train/sim_loss": 0.0546875 }, { "epoch": 0.33943049238679057, "step": 3433, "train/total_loss": 0.1400667130947113 }, { "entropy": 9.344623565673828, "epoch": 0.3395293652363061, "mean_token_accuracy": 0.704023003578186, "num_tokens": 18873045.0, "step": 3434, "train/ce_loss": 1.1272016763687134 }, { "epoch": 0.3395293652363061, "step": 3434, "train/sim_loss": 0.125 }, { "epoch": 0.3395293652363061, "step": 3434, "train/total_loss": 0.23772016167640686 }, { "entropy": 8.921762466430664, "epoch": 0.33962823808582165, "mean_token_accuracy": 0.7918834686279297, "num_tokens": 18878543.0, "step": 3435, "train/ce_loss": 0.43090224266052246 }, { "epoch": 0.33962823808582165, "step": 3435, "train/sim_loss": 0.0546875 }, { "epoch": 0.33962823808582165, "step": 3435, "train/total_loss": 0.09777772426605225 }, { "entropy": 9.269332885742188, "epoch": 0.33972711093533714, "mean_token_accuracy": 0.7987729907035828, "num_tokens": 18883940.0, "step": 3436, "train/ce_loss": 0.7551689147949219 }, { "epoch": 0.33972711093533714, "step": 3436, "train/sim_loss": 0.0859375 }, { "epoch": 0.33972711093533714, "step": 3436, "train/total_loss": 0.16145439445972443 }, { "entropy": 9.113157272338867, "epoch": 0.3398259837848527, "mean_token_accuracy": 0.7259174585342407, "num_tokens": 18889401.0, "step": 3437, "train/ce_loss": 1.3499926328659058 }, { "epoch": 0.3398259837848527, "step": 3437, "train/sim_loss": 0.08203125 }, { "epoch": 0.3398259837848527, "step": 3437, "train/total_loss": 0.21703051030635834 }, { "entropy": 9.007105827331543, "epoch": 0.3399248566343682, "mean_token_accuracy": 0.7592008709907532, "num_tokens": 18895016.0, "step": 3438, "train/ce_loss": 0.514656662940979 }, { "epoch": 0.3399248566343682, "step": 3438, "train/sim_loss": 0.046875 }, { "epoch": 0.3399248566343682, "step": 3438, "train/total_loss": 0.09834066778421402 }, { "entropy": 9.278051376342773, "epoch": 0.3400237294838837, "mean_token_accuracy": 0.7275000214576721, "num_tokens": 18900382.0, "step": 3439, "train/ce_loss": 0.7733865976333618 }, { "epoch": 0.3400237294838837, "step": 3439, "train/sim_loss": 0.03125 }, { "epoch": 0.3400237294838837, "step": 3439, "train/total_loss": 0.10858865827322006 }, { "epoch": 0.34012260233339925, "grad_norm": 0.8262110948562622, "learning_rate": 9.152202937249667e-06, "loss": 0.1456, "step": 3440 }, { "entropy": 9.293132781982422, "epoch": 0.34012260233339925, "mean_token_accuracy": 0.7381258010864258, "num_tokens": 18905751.0, "step": 3440, "train/ce_loss": 1.0934220552444458 }, { "epoch": 0.34012260233339925, "step": 3440, "train/sim_loss": 0.0625 }, { "epoch": 0.34012260233339925, "step": 3440, "train/total_loss": 0.17184221744537354 }, { "entropy": 9.091936111450195, "epoch": 0.3402214751829148, "mean_token_accuracy": 0.7532933950424194, "num_tokens": 18911332.0, "step": 3441, "train/ce_loss": 0.6853905916213989 }, { "epoch": 0.3402214751829148, "step": 3441, "train/sim_loss": 0.05859375 }, { "epoch": 0.3402214751829148, "step": 3441, "train/total_loss": 0.12713280320167542 }, { "entropy": 8.964405059814453, "epoch": 0.3403203480324303, "mean_token_accuracy": 0.6792849898338318, "num_tokens": 18916888.0, "step": 3442, "train/ce_loss": 0.7411906719207764 }, { "epoch": 0.3403203480324303, "step": 3442, "train/sim_loss": 0.0625 }, { "epoch": 0.3403203480324303, "step": 3442, "train/total_loss": 0.13661906123161316 }, { "entropy": 8.831523895263672, "epoch": 0.3404192208819458, "mean_token_accuracy": 0.7115188837051392, "num_tokens": 18922583.0, "step": 3443, "train/ce_loss": 1.1743850708007812 }, { "epoch": 0.3404192208819458, "step": 3443, "train/sim_loss": 0.0859375 }, { "epoch": 0.3404192208819458, "step": 3443, "train/total_loss": 0.20337601006031036 }, { "entropy": 9.07084846496582, "epoch": 0.34051809373146136, "mean_token_accuracy": 0.7669172883033752, "num_tokens": 18927999.0, "step": 3444, "train/ce_loss": 1.0597543716430664 }, { "epoch": 0.34051809373146136, "step": 3444, "train/sim_loss": 0.109375 }, { "epoch": 0.34051809373146136, "step": 3444, "train/total_loss": 0.2153504490852356 }, { "entropy": 8.824703216552734, "epoch": 0.34061696658097684, "mean_token_accuracy": 0.699999988079071, "num_tokens": 18933668.0, "step": 3445, "train/ce_loss": 2.0979087352752686 }, { "epoch": 0.34061696658097684, "step": 3445, "train/sim_loss": 0.0859375 }, { "epoch": 0.34061696658097684, "step": 3445, "train/total_loss": 0.2957283854484558 }, { "entropy": 8.80289077758789, "epoch": 0.3407158394304924, "mean_token_accuracy": 0.7644710540771484, "num_tokens": 18939253.0, "step": 3446, "train/ce_loss": 0.42786648869514465 }, { "epoch": 0.3407158394304924, "step": 3446, "train/sim_loss": 0.09765625 }, { "epoch": 0.3407158394304924, "step": 3446, "train/total_loss": 0.14044290781021118 }, { "entropy": 9.01729679107666, "epoch": 0.3408147122800079, "mean_token_accuracy": 0.6949541568756104, "num_tokens": 18944663.0, "step": 3447, "train/ce_loss": 0.9936308264732361 }, { "epoch": 0.3408147122800079, "step": 3447, "train/sim_loss": 0.05859375 }, { "epoch": 0.3408147122800079, "step": 3447, "train/total_loss": 0.15795683860778809 }, { "entropy": 9.613985061645508, "epoch": 0.3409135851295234, "mean_token_accuracy": 0.7375415563583374, "num_tokens": 18949908.0, "step": 3448, "train/ce_loss": 0.9047170281410217 }, { "epoch": 0.3409135851295234, "step": 3448, "train/sim_loss": 0.05078125 }, { "epoch": 0.3409135851295234, "step": 3448, "train/total_loss": 0.14125296473503113 }, { "entropy": 8.753910064697266, "epoch": 0.34101245797903895, "mean_token_accuracy": 0.7640449404716492, "num_tokens": 18955483.0, "step": 3449, "train/ce_loss": 0.6565993428230286 }, { "epoch": 0.34101245797903895, "step": 3449, "train/sim_loss": 0.06640625 }, { "epoch": 0.34101245797903895, "step": 3449, "train/total_loss": 0.13206619024276733 }, { "entropy": 9.210835456848145, "epoch": 0.3411113308285545, "mean_token_accuracy": 0.7387606501579285, "num_tokens": 18960918.0, "step": 3450, "train/ce_loss": 0.748189389705658 }, { "epoch": 0.3411113308285545, "step": 3450, "train/sim_loss": 0.10546875 }, { "epoch": 0.3411113308285545, "step": 3450, "train/total_loss": 0.1802876889705658 }, { "entropy": 9.003278732299805, "epoch": 0.34121020367807, "mean_token_accuracy": 0.774944543838501, "num_tokens": 18966465.0, "step": 3451, "train/ce_loss": 0.39196887612342834 }, { "epoch": 0.34121020367807, "step": 3451, "train/sim_loss": 0.0234375 }, { "epoch": 0.34121020367807, "step": 3451, "train/total_loss": 0.06263439357280731 }, { "entropy": 9.133649826049805, "epoch": 0.3413090765275855, "mean_token_accuracy": 0.7163814306259155, "num_tokens": 18971860.0, "step": 3452, "train/ce_loss": 1.1277834177017212 }, { "epoch": 0.3413090765275855, "step": 3452, "train/sim_loss": 0.07421875 }, { "epoch": 0.3413090765275855, "step": 3452, "train/total_loss": 0.18699708580970764 }, { "entropy": 8.829233169555664, "epoch": 0.34140794937710106, "mean_token_accuracy": 0.7010607719421387, "num_tokens": 18977564.0, "step": 3453, "train/ce_loss": 0.7178515195846558 }, { "epoch": 0.34140794937710106, "step": 3453, "train/sim_loss": 0.0234375 }, { "epoch": 0.34140794937710106, "step": 3453, "train/total_loss": 0.09522265195846558 }, { "entropy": 9.113767623901367, "epoch": 0.34150682222661655, "mean_token_accuracy": 0.7747318148612976, "num_tokens": 18982941.0, "step": 3454, "train/ce_loss": 1.0060851573944092 }, { "epoch": 0.34150682222661655, "step": 3454, "train/sim_loss": 0.109375 }, { "epoch": 0.34150682222661655, "step": 3454, "train/total_loss": 0.20998352766036987 }, { "entropy": 8.910078048706055, "epoch": 0.3416056950761321, "mean_token_accuracy": 0.7704240083694458, "num_tokens": 18988588.0, "step": 3455, "train/ce_loss": 0.6327019929885864 }, { "epoch": 0.3416056950761321, "step": 3455, "train/sim_loss": 0.01953125 }, { "epoch": 0.3416056950761321, "step": 3455, "train/total_loss": 0.082801453769207 }, { "entropy": 8.967277526855469, "epoch": 0.34170456792564763, "mean_token_accuracy": 0.7324455380439758, "num_tokens": 18994090.0, "step": 3456, "train/ce_loss": 0.4688180685043335 }, { "epoch": 0.34170456792564763, "step": 3456, "train/sim_loss": 0.03515625 }, { "epoch": 0.34170456792564763, "step": 3456, "train/total_loss": 0.08203805983066559 }, { "entropy": 8.61281681060791, "epoch": 0.3418034407751631, "mean_token_accuracy": 0.7280917167663574, "num_tokens": 18999920.0, "step": 3457, "train/ce_loss": 0.5654793977737427 }, { "epoch": 0.3418034407751631, "step": 3457, "train/sim_loss": 0.03125 }, { "epoch": 0.3418034407751631, "step": 3457, "train/total_loss": 0.08779793977737427 }, { "entropy": 8.9698486328125, "epoch": 0.34190231362467866, "mean_token_accuracy": 0.7900403738021851, "num_tokens": 19005243.0, "step": 3458, "train/ce_loss": 0.766511857509613 }, { "epoch": 0.34190231362467866, "step": 3458, "train/sim_loss": 0.0390625 }, { "epoch": 0.34190231362467866, "step": 3458, "train/total_loss": 0.1157136857509613 }, { "entropy": 8.858774185180664, "epoch": 0.3420011864741942, "mean_token_accuracy": 0.7253433465957642, "num_tokens": 19010716.0, "step": 3459, "train/ce_loss": 1.2651472091674805 }, { "epoch": 0.3420011864741942, "step": 3459, "train/sim_loss": 0.14453125 }, { "epoch": 0.3420011864741942, "step": 3459, "train/total_loss": 0.271045982837677 }, { "epoch": 0.3421000593237097, "grad_norm": 0.8672721982002258, "learning_rate": 9.147258072491719e-06, "loss": 0.1496, "step": 3460 }, { "entropy": 9.186931610107422, "epoch": 0.3421000593237097, "mean_token_accuracy": 0.6875784397125244, "num_tokens": 19016103.0, "step": 3460, "train/ce_loss": 1.3224061727523804 }, { "epoch": 0.3421000593237097, "step": 3460, "train/sim_loss": 0.046875 }, { "epoch": 0.3421000593237097, "step": 3460, "train/total_loss": 0.17911562323570251 }, { "entropy": 9.327954292297363, "epoch": 0.3421989321732252, "mean_token_accuracy": 0.7951977252960205, "num_tokens": 19021368.0, "step": 3461, "train/ce_loss": 0.5632801055908203 }, { "epoch": 0.3421989321732252, "step": 3461, "train/sim_loss": 0.05078125 }, { "epoch": 0.3421989321732252, "step": 3461, "train/total_loss": 0.10710926353931427 }, { "entropy": 8.478191375732422, "epoch": 0.34229780502274076, "mean_token_accuracy": 0.6801988482475281, "num_tokens": 19027078.0, "step": 3462, "train/ce_loss": 0.5687795877456665 }, { "epoch": 0.34229780502274076, "step": 3462, "train/sim_loss": 0.0546875 }, { "epoch": 0.34229780502274076, "step": 3462, "train/total_loss": 0.11156545579433441 }, { "entropy": 8.764240264892578, "epoch": 0.3423966778722563, "mean_token_accuracy": 0.715969979763031, "num_tokens": 19032691.0, "step": 3463, "train/ce_loss": 1.003901720046997 }, { "epoch": 0.3423966778722563, "step": 3463, "train/sim_loss": 0.0625 }, { "epoch": 0.3423966778722563, "step": 3463, "train/total_loss": 0.16289016604423523 }, { "entropy": 8.889501571655273, "epoch": 0.3424955507217718, "mean_token_accuracy": 0.7424931526184082, "num_tokens": 19038395.0, "step": 3464, "train/ce_loss": 0.37960320711135864 }, { "epoch": 0.3424955507217718, "step": 3464, "train/sim_loss": 0.0390625 }, { "epoch": 0.3424955507217718, "step": 3464, "train/total_loss": 0.07702282071113586 }, { "entropy": 8.997015953063965, "epoch": 0.34259442357128733, "mean_token_accuracy": 0.7702845335006714, "num_tokens": 19043951.0, "step": 3465, "train/ce_loss": 0.41346922516822815 }, { "epoch": 0.34259442357128733, "step": 3465, "train/sim_loss": 0.06640625 }, { "epoch": 0.34259442357128733, "step": 3465, "train/total_loss": 0.10775317251682281 }, { "entropy": 8.842507362365723, "epoch": 0.3426932964208029, "mean_token_accuracy": 0.7400000095367432, "num_tokens": 19049532.0, "step": 3466, "train/ce_loss": 0.4944566786289215 }, { "epoch": 0.3426932964208029, "step": 3466, "train/sim_loss": 0.046875 }, { "epoch": 0.3426932964208029, "step": 3466, "train/total_loss": 0.09632067382335663 }, { "entropy": 8.805136680603027, "epoch": 0.34279216927031836, "mean_token_accuracy": 0.7492767572402954, "num_tokens": 19055205.0, "step": 3467, "train/ce_loss": 0.7770282626152039 }, { "epoch": 0.34279216927031836, "step": 3467, "train/sim_loss": 0.05078125 }, { "epoch": 0.34279216927031836, "step": 3467, "train/total_loss": 0.1284840703010559 }, { "entropy": 9.097391128540039, "epoch": 0.3428910421198339, "mean_token_accuracy": 0.7054871320724487, "num_tokens": 19060654.0, "step": 3468, "train/ce_loss": 1.393228530883789 }, { "epoch": 0.3428910421198339, "step": 3468, "train/sim_loss": 0.09375 }, { "epoch": 0.3428910421198339, "step": 3468, "train/total_loss": 0.23307286202907562 }, { "entropy": 9.288183212280273, "epoch": 0.34298991496934944, "mean_token_accuracy": 0.7335329055786133, "num_tokens": 19065949.0, "step": 3469, "train/ce_loss": 0.5422917604446411 }, { "epoch": 0.34298991496934944, "step": 3469, "train/sim_loss": 0.03515625 }, { "epoch": 0.34298991496934944, "step": 3469, "train/total_loss": 0.08938542753458023 }, { "entropy": 8.896051406860352, "epoch": 0.3430887878188649, "mean_token_accuracy": 0.7332601547241211, "num_tokens": 19071408.0, "step": 3470, "train/ce_loss": 0.36136937141418457 }, { "epoch": 0.3430887878188649, "step": 3470, "train/sim_loss": 0.046875 }, { "epoch": 0.3430887878188649, "step": 3470, "train/total_loss": 0.0830119401216507 }, { "entropy": 8.55036735534668, "epoch": 0.34318766066838047, "mean_token_accuracy": 0.7469757795333862, "num_tokens": 19076992.0, "step": 3471, "train/ce_loss": 0.5166313648223877 }, { "epoch": 0.34318766066838047, "step": 3471, "train/sim_loss": 0.046875 }, { "epoch": 0.34318766066838047, "step": 3471, "train/total_loss": 0.09853813797235489 }, { "entropy": 8.811485290527344, "epoch": 0.343286533517896, "mean_token_accuracy": 0.7755101919174194, "num_tokens": 19082518.0, "step": 3472, "train/ce_loss": 0.5732041597366333 }, { "epoch": 0.343286533517896, "step": 3472, "train/sim_loss": 0.0234375 }, { "epoch": 0.343286533517896, "step": 3472, "train/total_loss": 0.08075791597366333 }, { "entropy": 8.946882247924805, "epoch": 0.3433854063674115, "mean_token_accuracy": 0.7103825211524963, "num_tokens": 19088079.0, "step": 3473, "train/ce_loss": 0.7746336460113525 }, { "epoch": 0.3433854063674115, "step": 3473, "train/sim_loss": 0.09375 }, { "epoch": 0.3433854063674115, "step": 3473, "train/total_loss": 0.17121335864067078 }, { "entropy": 9.130309104919434, "epoch": 0.34348427921692704, "mean_token_accuracy": 0.7369093298912048, "num_tokens": 19093442.0, "step": 3474, "train/ce_loss": 0.5016657114028931 }, { "epoch": 0.34348427921692704, "step": 3474, "train/sim_loss": 0.02734375 }, { "epoch": 0.34348427921692704, "step": 3474, "train/total_loss": 0.07751032710075378 }, { "entropy": 9.06291389465332, "epoch": 0.3435831520664426, "mean_token_accuracy": 0.7849711179733276, "num_tokens": 19098915.0, "step": 3475, "train/ce_loss": 0.4369010329246521 }, { "epoch": 0.3435831520664426, "step": 3475, "train/sim_loss": 0.0390625 }, { "epoch": 0.3435831520664426, "step": 3475, "train/total_loss": 0.08275260031223297 }, { "entropy": 9.059921264648438, "epoch": 0.34368202491595806, "mean_token_accuracy": 0.7787182331085205, "num_tokens": 19104377.0, "step": 3476, "train/ce_loss": 0.5902351140975952 }, { "epoch": 0.34368202491595806, "step": 3476, "train/sim_loss": 0.0390625 }, { "epoch": 0.34368202491595806, "step": 3476, "train/total_loss": 0.09808601438999176 }, { "entropy": 8.935262680053711, "epoch": 0.3437808977654736, "mean_token_accuracy": 0.75027996301651, "num_tokens": 19109908.0, "step": 3477, "train/ce_loss": 1.317822813987732 }, { "epoch": 0.3437808977654736, "step": 3477, "train/sim_loss": 0.11328125 }, { "epoch": 0.3437808977654736, "step": 3477, "train/total_loss": 0.24506352841854095 }, { "entropy": 9.183292388916016, "epoch": 0.34387977061498914, "mean_token_accuracy": 0.7575376629829407, "num_tokens": 19115249.0, "step": 3478, "train/ce_loss": 0.5498520731925964 }, { "epoch": 0.34387977061498914, "step": 3478, "train/sim_loss": 0.05859375 }, { "epoch": 0.34387977061498914, "step": 3478, "train/total_loss": 0.11357896029949188 }, { "entropy": 8.506232261657715, "epoch": 0.34397864346450463, "mean_token_accuracy": 0.7023593187332153, "num_tokens": 19120867.0, "step": 3479, "train/ce_loss": 1.6355990171432495 }, { "epoch": 0.34397864346450463, "step": 3479, "train/sim_loss": 0.046875 }, { "epoch": 0.34397864346450463, "step": 3479, "train/total_loss": 0.2104348987340927 }, { "epoch": 0.34407751631402017, "grad_norm": 0.7803540825843811, "learning_rate": 9.14231320773377e-06, "loss": 0.1396, "step": 3480 }, { "entropy": 8.807378768920898, "epoch": 0.34407751631402017, "mean_token_accuracy": 0.7323943376541138, "num_tokens": 19126467.0, "step": 3480, "train/ce_loss": 0.7408857345581055 }, { "epoch": 0.34407751631402017, "step": 3480, "train/sim_loss": 0.07421875 }, { "epoch": 0.34407751631402017, "step": 3480, "train/total_loss": 0.14830732345581055 }, { "entropy": 9.01370906829834, "epoch": 0.3441763891635357, "mean_token_accuracy": 0.7033805847167969, "num_tokens": 19131991.0, "step": 3481, "train/ce_loss": 1.8351975679397583 }, { "epoch": 0.3441763891635357, "step": 3481, "train/sim_loss": 0.10546875 }, { "epoch": 0.3441763891635357, "step": 3481, "train/total_loss": 0.28898853063583374 }, { "entropy": 8.702826499938965, "epoch": 0.3442752620130512, "mean_token_accuracy": 0.7862856984138489, "num_tokens": 19137488.0, "step": 3482, "train/ce_loss": 0.6380066871643066 }, { "epoch": 0.3442752620130512, "step": 3482, "train/sim_loss": 0.05859375 }, { "epoch": 0.3442752620130512, "step": 3482, "train/total_loss": 0.12239442020654678 }, { "entropy": 8.95226001739502, "epoch": 0.34437413486256674, "mean_token_accuracy": 0.7309237122535706, "num_tokens": 19142966.0, "step": 3483, "train/ce_loss": 0.6527575254440308 }, { "epoch": 0.34437413486256674, "step": 3483, "train/sim_loss": 0.01953125 }, { "epoch": 0.34437413486256674, "step": 3483, "train/total_loss": 0.08480700105428696 }, { "entropy": 9.015726089477539, "epoch": 0.3444730077120823, "mean_token_accuracy": 0.7162471413612366, "num_tokens": 19148442.0, "step": 3484, "train/ce_loss": 1.1231226921081543 }, { "epoch": 0.3444730077120823, "step": 3484, "train/sim_loss": 0.08984375 }, { "epoch": 0.3444730077120823, "step": 3484, "train/total_loss": 0.20215602219104767 }, { "entropy": 8.984132766723633, "epoch": 0.34457188056159777, "mean_token_accuracy": 0.7305986881256104, "num_tokens": 19153938.0, "step": 3485, "train/ce_loss": 0.430483877658844 }, { "epoch": 0.34457188056159777, "step": 3485, "train/sim_loss": 0.046875 }, { "epoch": 0.34457188056159777, "step": 3485, "train/total_loss": 0.08992338925600052 }, { "entropy": 8.829473495483398, "epoch": 0.3446707534111133, "mean_token_accuracy": 0.7328482270240784, "num_tokens": 19159569.0, "step": 3486, "train/ce_loss": 0.8806841373443604 }, { "epoch": 0.3446707534111133, "step": 3486, "train/sim_loss": 0.03515625 }, { "epoch": 0.3446707534111133, "step": 3486, "train/total_loss": 0.1232246682047844 }, { "entropy": 8.833452224731445, "epoch": 0.34476962626062885, "mean_token_accuracy": 0.7754654884338379, "num_tokens": 19165073.0, "step": 3487, "train/ce_loss": 1.0423998832702637 }, { "epoch": 0.34476962626062885, "step": 3487, "train/sim_loss": 0.06640625 }, { "epoch": 0.34476962626062885, "step": 3487, "train/total_loss": 0.17064625024795532 }, { "entropy": 9.219670295715332, "epoch": 0.34486849911014433, "mean_token_accuracy": 0.6779661178588867, "num_tokens": 19170378.0, "step": 3488, "train/ce_loss": 0.95316481590271 }, { "epoch": 0.34486849911014433, "step": 3488, "train/sim_loss": 0.0625 }, { "epoch": 0.34486849911014433, "step": 3488, "train/total_loss": 0.15781648457050323 }, { "entropy": 8.971220016479492, "epoch": 0.3449673719596599, "mean_token_accuracy": 0.7157232761383057, "num_tokens": 19175808.0, "step": 3489, "train/ce_loss": 0.46807798743247986 }, { "epoch": 0.3449673719596599, "step": 3489, "train/sim_loss": 0.05859375 }, { "epoch": 0.3449673719596599, "step": 3489, "train/total_loss": 0.10540154576301575 }, { "entropy": 9.093448638916016, "epoch": 0.3450662448091754, "mean_token_accuracy": 0.732467532157898, "num_tokens": 19181212.0, "step": 3490, "train/ce_loss": 0.9909375309944153 }, { "epoch": 0.3450662448091754, "step": 3490, "train/sim_loss": 0.0625 }, { "epoch": 0.3450662448091754, "step": 3490, "train/total_loss": 0.16159376502037048 }, { "entropy": 9.162252426147461, "epoch": 0.3451651176586909, "mean_token_accuracy": 0.7771883010864258, "num_tokens": 19186582.0, "step": 3491, "train/ce_loss": 0.7377031445503235 }, { "epoch": 0.3451651176586909, "step": 3491, "train/sim_loss": 0.0625 }, { "epoch": 0.3451651176586909, "step": 3491, "train/total_loss": 0.13627031445503235 }, { "entropy": 9.048967361450195, "epoch": 0.34526399050820644, "mean_token_accuracy": 0.799315869808197, "num_tokens": 19192105.0, "step": 3492, "train/ce_loss": 0.7833853363990784 }, { "epoch": 0.34526399050820644, "step": 3492, "train/sim_loss": 0.07421875 }, { "epoch": 0.34526399050820644, "step": 3492, "train/total_loss": 0.15255728363990784 }, { "entropy": 8.736431121826172, "epoch": 0.345362863357722, "mean_token_accuracy": 0.7442965507507324, "num_tokens": 19197764.0, "step": 3493, "train/ce_loss": 1.0979288816452026 }, { "epoch": 0.345362863357722, "step": 3493, "train/sim_loss": 0.03515625 }, { "epoch": 0.345362863357722, "step": 3493, "train/total_loss": 0.14494913816452026 }, { "entropy": 8.978483200073242, "epoch": 0.34546173620723747, "mean_token_accuracy": 0.7523302435874939, "num_tokens": 19203114.0, "step": 3494, "train/ce_loss": 0.6333264112472534 }, { "epoch": 0.34546173620723747, "step": 3494, "train/sim_loss": 0.0390625 }, { "epoch": 0.34546173620723747, "step": 3494, "train/total_loss": 0.10239513963460922 }, { "entropy": 9.005317687988281, "epoch": 0.345560609056753, "mean_token_accuracy": 0.7472392916679382, "num_tokens": 19208595.0, "step": 3495, "train/ce_loss": 1.2684228420257568 }, { "epoch": 0.345560609056753, "step": 3495, "train/sim_loss": 0.05078125 }, { "epoch": 0.345560609056753, "step": 3495, "train/total_loss": 0.17762354016304016 }, { "entropy": 9.338470458984375, "epoch": 0.34565948190626855, "mean_token_accuracy": 0.7316715717315674, "num_tokens": 19213860.0, "step": 3496, "train/ce_loss": 0.5931958556175232 }, { "epoch": 0.34565948190626855, "step": 3496, "train/sim_loss": 0.015625 }, { "epoch": 0.34565948190626855, "step": 3496, "train/total_loss": 0.07494458556175232 }, { "entropy": 8.87402629852295, "epoch": 0.34575835475578404, "mean_token_accuracy": 0.7842046618461609, "num_tokens": 19219403.0, "step": 3497, "train/ce_loss": 0.5546629428863525 }, { "epoch": 0.34575835475578404, "step": 3497, "train/sim_loss": 0.02734375 }, { "epoch": 0.34575835475578404, "step": 3497, "train/total_loss": 0.08281004428863525 }, { "entropy": 8.551200866699219, "epoch": 0.3458572276052996, "mean_token_accuracy": 0.7327188849449158, "num_tokens": 19225318.0, "step": 3498, "train/ce_loss": 0.3824247419834137 }, { "epoch": 0.3458572276052996, "step": 3498, "train/sim_loss": 0.09765625 }, { "epoch": 0.3458572276052996, "step": 3498, "train/total_loss": 0.13589872419834137 }, { "entropy": 8.992961883544922, "epoch": 0.3459561004548151, "mean_token_accuracy": 0.768873393535614, "num_tokens": 19230727.0, "step": 3499, "train/ce_loss": 1.1188433170318604 }, { "epoch": 0.3459561004548151, "step": 3499, "train/sim_loss": 0.0390625 }, { "epoch": 0.3459561004548151, "step": 3499, "train/total_loss": 0.15094682574272156 }, { "epoch": 0.3460549733043306, "grad_norm": 0.6598859429359436, "learning_rate": 9.137368342975821e-06, "loss": 0.1412, "step": 3500 }, { "entropy": 9.083660125732422, "epoch": 0.3460549733043306, "mean_token_accuracy": 0.7335025668144226, "num_tokens": 19236081.0, "step": 3500, "train/ce_loss": 0.5867977738380432 }, { "epoch": 0.3460549733043306, "step": 3500, "train/sim_loss": 0.05859375 }, { "epoch": 0.3460549733043306, "step": 3500, "train/total_loss": 0.11727352440357208 }, { "entropy": 9.03471565246582, "epoch": 0.34615384615384615, "mean_token_accuracy": 0.7369109988212585, "num_tokens": 19241425.0, "step": 3501, "train/ce_loss": 0.875075101852417 }, { "epoch": 0.34615384615384615, "step": 3501, "train/sim_loss": 0.07421875 }, { "epoch": 0.34615384615384615, "step": 3501, "train/total_loss": 0.16172626614570618 }, { "entropy": 9.119497299194336, "epoch": 0.3462527190033617, "mean_token_accuracy": 0.7786561250686646, "num_tokens": 19246810.0, "step": 3502, "train/ce_loss": 0.5451950430870056 }, { "epoch": 0.3462527190033617, "step": 3502, "train/sim_loss": 0.0390625 }, { "epoch": 0.3462527190033617, "step": 3502, "train/total_loss": 0.09358200430870056 }, { "entropy": 8.560791015625, "epoch": 0.3463515918528772, "mean_token_accuracy": 0.6858552694320679, "num_tokens": 19252616.0, "step": 3503, "train/ce_loss": 2.0479705333709717 }, { "epoch": 0.3463515918528772, "step": 3503, "train/sim_loss": 0.0859375 }, { "epoch": 0.3463515918528772, "step": 3503, "train/total_loss": 0.29073455929756165 }, { "entropy": 8.868547439575195, "epoch": 0.3464504647023927, "mean_token_accuracy": 0.7036144733428955, "num_tokens": 19258078.0, "step": 3504, "train/ce_loss": 1.4053865671157837 }, { "epoch": 0.3464504647023927, "step": 3504, "train/sim_loss": 0.05859375 }, { "epoch": 0.3464504647023927, "step": 3504, "train/total_loss": 0.19913241267204285 }, { "entropy": 9.107654571533203, "epoch": 0.34654933755190825, "mean_token_accuracy": 0.748031497001648, "num_tokens": 19263438.0, "step": 3505, "train/ce_loss": 0.7051311731338501 }, { "epoch": 0.34654933755190825, "step": 3505, "train/sim_loss": 0.0234375 }, { "epoch": 0.34654933755190825, "step": 3505, "train/total_loss": 0.09395062178373337 }, { "entropy": 8.882123947143555, "epoch": 0.3466482104014238, "mean_token_accuracy": 0.7122153043746948, "num_tokens": 19268979.0, "step": 3506, "train/ce_loss": 0.6533194184303284 }, { "epoch": 0.3466482104014238, "step": 3506, "train/sim_loss": 0.0390625 }, { "epoch": 0.3466482104014238, "step": 3506, "train/total_loss": 0.10439444333314896 }, { "entropy": 8.912294387817383, "epoch": 0.3467470832509393, "mean_token_accuracy": 0.7029831409454346, "num_tokens": 19274342.0, "step": 3507, "train/ce_loss": 1.2075226306915283 }, { "epoch": 0.3467470832509393, "step": 3507, "train/sim_loss": 0.125 }, { "epoch": 0.3467470832509393, "step": 3507, "train/total_loss": 0.2457522749900818 }, { "entropy": 8.165135383605957, "epoch": 0.3468459561004548, "mean_token_accuracy": 0.7038461565971375, "num_tokens": 19280265.0, "step": 3508, "train/ce_loss": 0.5031284093856812 }, { "epoch": 0.3468459561004548, "step": 3508, "train/sim_loss": 0.0390625 }, { "epoch": 0.3468459561004548, "step": 3508, "train/total_loss": 0.08937534689903259 }, { "entropy": 8.967950820922852, "epoch": 0.34694482894997036, "mean_token_accuracy": 0.7234803438186646, "num_tokens": 19285681.0, "step": 3509, "train/ce_loss": 0.7562479972839355 }, { "epoch": 0.34694482894997036, "step": 3509, "train/sim_loss": 0.06640625 }, { "epoch": 0.34694482894997036, "step": 3509, "train/total_loss": 0.14203104376792908 }, { "entropy": 8.676383972167969, "epoch": 0.34704370179948585, "mean_token_accuracy": 0.7008403539657593, "num_tokens": 19291454.0, "step": 3510, "train/ce_loss": 1.7366557121276855 }, { "epoch": 0.34704370179948585, "step": 3510, "train/sim_loss": 0.04296875 }, { "epoch": 0.34704370179948585, "step": 3510, "train/total_loss": 0.21663431823253632 }, { "entropy": 8.861550331115723, "epoch": 0.3471425746490014, "mean_token_accuracy": 0.8242009282112122, "num_tokens": 19296958.0, "step": 3511, "train/ce_loss": 0.649535596370697 }, { "epoch": 0.3471425746490014, "step": 3511, "train/sim_loss": 0.0859375 }, { "epoch": 0.3471425746490014, "step": 3511, "train/total_loss": 0.15089106559753418 }, { "entropy": 8.977502822875977, "epoch": 0.34724144749851693, "mean_token_accuracy": 0.7206704020500183, "num_tokens": 19302455.0, "step": 3512, "train/ce_loss": 0.8881929516792297 }, { "epoch": 0.34724144749851693, "step": 3512, "train/sim_loss": 0.0546875 }, { "epoch": 0.34724144749851693, "step": 3512, "train/total_loss": 0.14350679516792297 }, { "entropy": 8.871850967407227, "epoch": 0.3473403203480324, "mean_token_accuracy": 0.7303974032402039, "num_tokens": 19308068.0, "step": 3513, "train/ce_loss": 0.7578152418136597 }, { "epoch": 0.3473403203480324, "step": 3513, "train/sim_loss": 0.046875 }, { "epoch": 0.3473403203480324, "step": 3513, "train/total_loss": 0.12265652418136597 }, { "entropy": 8.95494270324707, "epoch": 0.34743919319754796, "mean_token_accuracy": 0.756242573261261, "num_tokens": 19313561.0, "step": 3514, "train/ce_loss": 0.588775098323822 }, { "epoch": 0.34743919319754796, "step": 3514, "train/sim_loss": 0.0546875 }, { "epoch": 0.34743919319754796, "step": 3514, "train/total_loss": 0.11356501281261444 }, { "entropy": 8.802218437194824, "epoch": 0.3475380660470635, "mean_token_accuracy": 0.7538644671440125, "num_tokens": 19319035.0, "step": 3515, "train/ce_loss": 0.7569316029548645 }, { "epoch": 0.3475380660470635, "step": 3515, "train/sim_loss": 0.05078125 }, { "epoch": 0.3475380660470635, "step": 3515, "train/total_loss": 0.12647441029548645 }, { "entropy": 9.016779899597168, "epoch": 0.347636938896579, "mean_token_accuracy": 0.7216610312461853, "num_tokens": 19324513.0, "step": 3516, "train/ce_loss": 0.9612802863121033 }, { "epoch": 0.347636938896579, "step": 3516, "train/sim_loss": 0.046875 }, { "epoch": 0.347636938896579, "step": 3516, "train/total_loss": 0.14300303161144257 }, { "entropy": 8.713796615600586, "epoch": 0.3477358117460945, "mean_token_accuracy": 0.7157894968986511, "num_tokens": 19330034.0, "step": 3517, "train/ce_loss": 0.570966362953186 }, { "epoch": 0.3477358117460945, "step": 3517, "train/sim_loss": 0.0625 }, { "epoch": 0.3477358117460945, "step": 3517, "train/total_loss": 0.11959663778543472 }, { "entropy": 8.413741111755371, "epoch": 0.34783468459561007, "mean_token_accuracy": 0.7407407164573669, "num_tokens": 19335851.0, "step": 3518, "train/ce_loss": 0.5531697869300842 }, { "epoch": 0.34783468459561007, "step": 3518, "train/sim_loss": 0.03515625 }, { "epoch": 0.34783468459561007, "step": 3518, "train/total_loss": 0.0904732346534729 }, { "entropy": 9.118605613708496, "epoch": 0.34793355744512555, "mean_token_accuracy": 0.7291428446769714, "num_tokens": 19341322.0, "step": 3519, "train/ce_loss": 0.5572152137756348 }, { "epoch": 0.34793355744512555, "step": 3519, "train/sim_loss": 0.0234375 }, { "epoch": 0.34793355744512555, "step": 3519, "train/total_loss": 0.07915902137756348 }, { "epoch": 0.3480324302946411, "grad_norm": 0.8082781434059143, "learning_rate": 9.132423478217872e-06, "loss": 0.1504, "step": 3520 }, { "entropy": 9.05672836303711, "epoch": 0.3480324302946411, "mean_token_accuracy": 0.7904993891716003, "num_tokens": 19346698.0, "step": 3520, "train/ce_loss": 0.7014380097389221 }, { "epoch": 0.3480324302946411, "step": 3520, "train/sim_loss": 0.015625 }, { "epoch": 0.3480324302946411, "step": 3520, "train/total_loss": 0.08576880395412445 }, { "entropy": 9.147746086120605, "epoch": 0.34813130314415663, "mean_token_accuracy": 0.7223650217056274, "num_tokens": 19352115.0, "step": 3521, "train/ce_loss": 0.6889724731445312 }, { "epoch": 0.34813130314415663, "step": 3521, "train/sim_loss": 0.046875 }, { "epoch": 0.34813130314415663, "step": 3521, "train/total_loss": 0.11577224731445312 }, { "entropy": 8.777778625488281, "epoch": 0.3482301759936721, "mean_token_accuracy": 0.6995841860771179, "num_tokens": 19357713.0, "step": 3522, "train/ce_loss": 1.1292766332626343 }, { "epoch": 0.3482301759936721, "step": 3522, "train/sim_loss": 0.140625 }, { "epoch": 0.3482301759936721, "step": 3522, "train/total_loss": 0.2535526752471924 }, { "entropy": 9.159420013427734, "epoch": 0.34832904884318766, "mean_token_accuracy": 0.7718309760093689, "num_tokens": 19363038.0, "step": 3523, "train/ce_loss": 0.7333871126174927 }, { "epoch": 0.34832904884318766, "step": 3523, "train/sim_loss": 0.0703125 }, { "epoch": 0.34832904884318766, "step": 3523, "train/total_loss": 0.14365121722221375 }, { "entropy": 8.803606033325195, "epoch": 0.3484279216927032, "mean_token_accuracy": 0.7132530212402344, "num_tokens": 19368491.0, "step": 3524, "train/ce_loss": 0.7194620370864868 }, { "epoch": 0.3484279216927032, "step": 3524, "train/sim_loss": 0.07421875 }, { "epoch": 0.3484279216927032, "step": 3524, "train/total_loss": 0.14616495370864868 }, { "entropy": 8.717127799987793, "epoch": 0.3485267945422187, "mean_token_accuracy": 0.7417893409729004, "num_tokens": 19373990.0, "step": 3525, "train/ce_loss": 0.5860023498535156 }, { "epoch": 0.3485267945422187, "step": 3525, "train/sim_loss": 0.05078125 }, { "epoch": 0.3485267945422187, "step": 3525, "train/total_loss": 0.10938148200511932 }, { "entropy": 9.078065872192383, "epoch": 0.34862566739173423, "mean_token_accuracy": 0.8027777671813965, "num_tokens": 19379287.0, "step": 3526, "train/ce_loss": 0.5148441195487976 }, { "epoch": 0.34862566739173423, "step": 3526, "train/sim_loss": 0.02734375 }, { "epoch": 0.34862566739173423, "step": 3526, "train/total_loss": 0.07882816344499588 }, { "entropy": 9.192644119262695, "epoch": 0.34872454024124977, "mean_token_accuracy": 0.7783505320549011, "num_tokens": 19384653.0, "step": 3527, "train/ce_loss": 0.7134721875190735 }, { "epoch": 0.34872454024124977, "step": 3527, "train/sim_loss": 0.04296875 }, { "epoch": 0.34872454024124977, "step": 3527, "train/total_loss": 0.11431597173213959 }, { "entropy": 9.432594299316406, "epoch": 0.34882341309076526, "mean_token_accuracy": 0.7463768124580383, "num_tokens": 19389931.0, "step": 3528, "train/ce_loss": 1.0656017065048218 }, { "epoch": 0.34882341309076526, "step": 3528, "train/sim_loss": 0.0546875 }, { "epoch": 0.34882341309076526, "step": 3528, "train/total_loss": 0.16124767065048218 }, { "entropy": 9.125032424926758, "epoch": 0.3489222859402808, "mean_token_accuracy": 0.6890756487846375, "num_tokens": 19395376.0, "step": 3529, "train/ce_loss": 0.6409738659858704 }, { "epoch": 0.3489222859402808, "step": 3529, "train/sim_loss": 0.03125 }, { "epoch": 0.3489222859402808, "step": 3529, "train/total_loss": 0.09534738957881927 }, { "entropy": 9.03551197052002, "epoch": 0.34902115878979634, "mean_token_accuracy": 0.7934508919715881, "num_tokens": 19400730.0, "step": 3530, "train/ce_loss": 0.4952864944934845 }, { "epoch": 0.34902115878979634, "step": 3530, "train/sim_loss": 0.03125 }, { "epoch": 0.34902115878979634, "step": 3530, "train/total_loss": 0.08077865093946457 }, { "entropy": 8.843345642089844, "epoch": 0.3491200316393118, "mean_token_accuracy": 0.733264684677124, "num_tokens": 19406264.0, "step": 3531, "train/ce_loss": 1.5205810070037842 }, { "epoch": 0.3491200316393118, "step": 3531, "train/sim_loss": 0.0859375 }, { "epoch": 0.3491200316393118, "step": 3531, "train/total_loss": 0.23799560964107513 }, { "entropy": 9.155932426452637, "epoch": 0.34921890448882736, "mean_token_accuracy": 0.8102625012397766, "num_tokens": 19411775.0, "step": 3532, "train/ce_loss": 0.558832585811615 }, { "epoch": 0.34921890448882736, "step": 3532, "train/sim_loss": 0.02734375 }, { "epoch": 0.34921890448882736, "step": 3532, "train/total_loss": 0.0832270085811615 }, { "entropy": 8.800640106201172, "epoch": 0.3493177773383429, "mean_token_accuracy": 0.7384305596351624, "num_tokens": 19417540.0, "step": 3533, "train/ce_loss": 1.1896774768829346 }, { "epoch": 0.3493177773383429, "step": 3533, "train/sim_loss": 0.08984375 }, { "epoch": 0.3493177773383429, "step": 3533, "train/total_loss": 0.20881149172782898 }, { "entropy": 8.958824157714844, "epoch": 0.3494166501878584, "mean_token_accuracy": 0.7535714507102966, "num_tokens": 19423002.0, "step": 3534, "train/ce_loss": 0.4619155824184418 }, { "epoch": 0.3494166501878584, "step": 3534, "train/sim_loss": 0.05078125 }, { "epoch": 0.3494166501878584, "step": 3534, "train/total_loss": 0.09697280824184418 }, { "entropy": 8.982423782348633, "epoch": 0.34951552303737393, "mean_token_accuracy": 0.7446300983428955, "num_tokens": 19428477.0, "step": 3535, "train/ce_loss": 0.6341041326522827 }, { "epoch": 0.34951552303737393, "step": 3535, "train/sim_loss": 0.05859375 }, { "epoch": 0.34951552303737393, "step": 3535, "train/total_loss": 0.12200416624546051 }, { "entropy": 9.48110580444336, "epoch": 0.3496143958868895, "mean_token_accuracy": 0.7595993280410767, "num_tokens": 19433726.0, "step": 3536, "train/ce_loss": 0.5529555678367615 }, { "epoch": 0.3496143958868895, "step": 3536, "train/sim_loss": 0.0625 }, { "epoch": 0.3496143958868895, "step": 3536, "train/total_loss": 0.11779555678367615 }, { "entropy": 8.851311683654785, "epoch": 0.34971326873640496, "mean_token_accuracy": 0.8126919269561768, "num_tokens": 19439315.0, "step": 3537, "train/ce_loss": 0.43004608154296875 }, { "epoch": 0.34971326873640496, "step": 3537, "train/sim_loss": 0.03125 }, { "epoch": 0.34971326873640496, "step": 3537, "train/total_loss": 0.074254609644413 }, { "entropy": 8.847206115722656, "epoch": 0.3498121415859205, "mean_token_accuracy": 0.7279070019721985, "num_tokens": 19444831.0, "step": 3538, "train/ce_loss": 1.181033968925476 }, { "epoch": 0.3498121415859205, "step": 3538, "train/sim_loss": 0.0703125 }, { "epoch": 0.3498121415859205, "step": 3538, "train/total_loss": 0.18841589987277985 }, { "entropy": 9.071733474731445, "epoch": 0.34991101443543604, "mean_token_accuracy": 0.7607361674308777, "num_tokens": 19450330.0, "step": 3539, "train/ce_loss": 1.0913355350494385 }, { "epoch": 0.34991101443543604, "step": 3539, "train/sim_loss": 0.08203125 }, { "epoch": 0.34991101443543604, "step": 3539, "train/total_loss": 0.1911648064851761 }, { "epoch": 0.3500098872849515, "grad_norm": 0.7812337279319763, "learning_rate": 9.127478613459922e-06, "loss": 0.1414, "step": 3540 }, { "entropy": 9.068065643310547, "epoch": 0.3500098872849515, "mean_token_accuracy": 0.7223719954490662, "num_tokens": 19455712.0, "step": 3540, "train/ce_loss": 1.3006138801574707 }, { "epoch": 0.3500098872849515, "step": 3540, "train/sim_loss": 0.0703125 }, { "epoch": 0.3500098872849515, "step": 3540, "train/total_loss": 0.20037388801574707 }, { "entropy": 8.76467514038086, "epoch": 0.35010876013446707, "mean_token_accuracy": 0.7331995964050293, "num_tokens": 19461322.0, "step": 3541, "train/ce_loss": 1.1381933689117432 }, { "epoch": 0.35010876013446707, "step": 3541, "train/sim_loss": 0.078125 }, { "epoch": 0.35010876013446707, "step": 3541, "train/total_loss": 0.19194433093070984 }, { "entropy": 8.909282684326172, "epoch": 0.3502076329839826, "mean_token_accuracy": 0.6983425617218018, "num_tokens": 19466852.0, "step": 3542, "train/ce_loss": 0.8952155113220215 }, { "epoch": 0.3502076329839826, "step": 3542, "train/sim_loss": 0.09375 }, { "epoch": 0.3502076329839826, "step": 3542, "train/total_loss": 0.18327155709266663 }, { "entropy": 8.938924789428711, "epoch": 0.3503065058334981, "mean_token_accuracy": 0.7459119558334351, "num_tokens": 19472332.0, "step": 3543, "train/ce_loss": 0.43490663170814514 }, { "epoch": 0.3503065058334981, "step": 3543, "train/sim_loss": 0.0625 }, { "epoch": 0.3503065058334981, "step": 3543, "train/total_loss": 0.10599066317081451 }, { "entropy": 8.801193237304688, "epoch": 0.35040537868301364, "mean_token_accuracy": 0.7355769276618958, "num_tokens": 19477730.0, "step": 3544, "train/ce_loss": 0.5320839881896973 }, { "epoch": 0.35040537868301364, "step": 3544, "train/sim_loss": 0.0546875 }, { "epoch": 0.35040537868301364, "step": 3544, "train/total_loss": 0.10789589583873749 }, { "entropy": 8.636659622192383, "epoch": 0.3505042515325292, "mean_token_accuracy": 0.7686939239501953, "num_tokens": 19483405.0, "step": 3545, "train/ce_loss": 0.9625001549720764 }, { "epoch": 0.3505042515325292, "step": 3545, "train/sim_loss": 0.09765625 }, { "epoch": 0.3505042515325292, "step": 3545, "train/total_loss": 0.1939062774181366 }, { "entropy": 8.971939086914062, "epoch": 0.3506031243820447, "mean_token_accuracy": 0.798701286315918, "num_tokens": 19488982.0, "step": 3546, "train/ce_loss": 0.4535934627056122 }, { "epoch": 0.3506031243820447, "step": 3546, "train/sim_loss": 0.09375 }, { "epoch": 0.3506031243820447, "step": 3546, "train/total_loss": 0.13910934329032898 }, { "entropy": 9.127031326293945, "epoch": 0.3507019972315602, "mean_token_accuracy": 0.7516425848007202, "num_tokens": 19494369.0, "step": 3547, "train/ce_loss": 0.7496052384376526 }, { "epoch": 0.3507019972315602, "step": 3547, "train/sim_loss": 0.0546875 }, { "epoch": 0.3507019972315602, "step": 3547, "train/total_loss": 0.12964802980422974 }, { "entropy": 8.799887657165527, "epoch": 0.35080087008107574, "mean_token_accuracy": 0.7385759949684143, "num_tokens": 19499852.0, "step": 3548, "train/ce_loss": 0.8976070880889893 }, { "epoch": 0.35080087008107574, "step": 3548, "train/sim_loss": 0.0703125 }, { "epoch": 0.35080087008107574, "step": 3548, "train/total_loss": 0.16007322072982788 }, { "entropy": 8.661090850830078, "epoch": 0.3508997429305913, "mean_token_accuracy": 0.7094240784645081, "num_tokens": 19505557.0, "step": 3549, "train/ce_loss": 0.37469831109046936 }, { "epoch": 0.3508997429305913, "step": 3549, "train/sim_loss": 0.05078125 }, { "epoch": 0.3508997429305913, "step": 3549, "train/total_loss": 0.08825108408927917 }, { "entropy": 8.97684383392334, "epoch": 0.35099861578010677, "mean_token_accuracy": 0.7567251324653625, "num_tokens": 19511086.0, "step": 3550, "train/ce_loss": 0.7544493675231934 }, { "epoch": 0.35099861578010677, "step": 3550, "train/sim_loss": 0.07421875 }, { "epoch": 0.35099861578010677, "step": 3550, "train/total_loss": 0.14966368675231934 }, { "entropy": 9.009611129760742, "epoch": 0.3510974886296223, "mean_token_accuracy": 0.6964048147201538, "num_tokens": 19516463.0, "step": 3551, "train/ce_loss": 1.1153016090393066 }, { "epoch": 0.3510974886296223, "step": 3551, "train/sim_loss": 0.05078125 }, { "epoch": 0.3510974886296223, "step": 3551, "train/total_loss": 0.1623114049434662 }, { "entropy": 9.0093355178833, "epoch": 0.35119636147913785, "mean_token_accuracy": 0.8011834025382996, "num_tokens": 19521891.0, "step": 3552, "train/ce_loss": 0.7906011343002319 }, { "epoch": 0.35119636147913785, "step": 3552, "train/sim_loss": 0.08984375 }, { "epoch": 0.35119636147913785, "step": 3552, "train/total_loss": 0.16890385746955872 }, { "entropy": 9.097875595092773, "epoch": 0.35129523432865334, "mean_token_accuracy": 0.7267355918884277, "num_tokens": 19527036.0, "step": 3553, "train/ce_loss": 0.8458390831947327 }, { "epoch": 0.35129523432865334, "step": 3553, "train/sim_loss": 0.06640625 }, { "epoch": 0.35129523432865334, "step": 3553, "train/total_loss": 0.15099015831947327 }, { "entropy": 8.664535522460938, "epoch": 0.3513941071781689, "mean_token_accuracy": 0.7303240895271301, "num_tokens": 19532546.0, "step": 3554, "train/ce_loss": 0.8683934807777405 }, { "epoch": 0.3513941071781689, "step": 3554, "train/sim_loss": 0.07421875 }, { "epoch": 0.3513941071781689, "step": 3554, "train/total_loss": 0.16105809807777405 }, { "entropy": 9.069976806640625, "epoch": 0.3514929800276844, "mean_token_accuracy": 0.7551724314689636, "num_tokens": 19538040.0, "step": 3555, "train/ce_loss": 0.4596770107746124 }, { "epoch": 0.3514929800276844, "step": 3555, "train/sim_loss": 0.05859375 }, { "epoch": 0.3514929800276844, "step": 3555, "train/total_loss": 0.104561448097229 }, { "entropy": 9.261690139770508, "epoch": 0.3515918528771999, "mean_token_accuracy": 0.725895345211029, "num_tokens": 19543412.0, "step": 3556, "train/ce_loss": 0.6765669584274292 }, { "epoch": 0.3515918528771999, "step": 3556, "train/sim_loss": 0.07421875 }, { "epoch": 0.3515918528771999, "step": 3556, "train/total_loss": 0.14187544584274292 }, { "entropy": 8.661426544189453, "epoch": 0.35169072572671545, "mean_token_accuracy": 0.728515625, "num_tokens": 19548994.0, "step": 3557, "train/ce_loss": 1.390517234802246 }, { "epoch": 0.35169072572671545, "step": 3557, "train/sim_loss": 0.05078125 }, { "epoch": 0.35169072572671545, "step": 3557, "train/total_loss": 0.18983297049999237 }, { "entropy": 9.260269165039062, "epoch": 0.351789598576231, "mean_token_accuracy": 0.7596685290336609, "num_tokens": 19554262.0, "step": 3558, "train/ce_loss": 0.8167557120323181 }, { "epoch": 0.351789598576231, "step": 3558, "train/sim_loss": 0.05859375 }, { "epoch": 0.351789598576231, "step": 3558, "train/total_loss": 0.14026932418346405 }, { "entropy": 9.050132751464844, "epoch": 0.3518884714257465, "mean_token_accuracy": 0.7635359168052673, "num_tokens": 19559794.0, "step": 3559, "train/ce_loss": 0.48740413784980774 }, { "epoch": 0.3518884714257465, "step": 3559, "train/sim_loss": 0.08984375 }, { "epoch": 0.3518884714257465, "step": 3559, "train/total_loss": 0.138584166765213 }, { "epoch": 0.351987344275262, "grad_norm": 0.7367194294929504, "learning_rate": 9.122533748701975e-06, "loss": 0.1452, "step": 3560 }, { "entropy": 9.127718925476074, "epoch": 0.351987344275262, "mean_token_accuracy": 0.7258485555648804, "num_tokens": 19565197.0, "step": 3560, "train/ce_loss": 0.8932803273200989 }, { "epoch": 0.351987344275262, "step": 3560, "train/sim_loss": 0.09375 }, { "epoch": 0.351987344275262, "step": 3560, "train/total_loss": 0.18307803571224213 }, { "entropy": 8.590702056884766, "epoch": 0.35208621712477756, "mean_token_accuracy": 0.7166095972061157, "num_tokens": 19570825.0, "step": 3561, "train/ce_loss": 0.7108856439590454 }, { "epoch": 0.35208621712477756, "step": 3561, "train/sim_loss": 0.04296875 }, { "epoch": 0.35208621712477756, "step": 3561, "train/total_loss": 0.11405731737613678 }, { "entropy": 9.163217544555664, "epoch": 0.35218508997429304, "mean_token_accuracy": 0.7408804893493652, "num_tokens": 19576261.0, "step": 3562, "train/ce_loss": 1.5136717557907104 }, { "epoch": 0.35218508997429304, "step": 3562, "train/sim_loss": 0.08984375 }, { "epoch": 0.35218508997429304, "step": 3562, "train/total_loss": 0.2412109225988388 }, { "entropy": 8.73428726196289, "epoch": 0.3522839628238086, "mean_token_accuracy": 0.7209039330482483, "num_tokens": 19581804.0, "step": 3563, "train/ce_loss": 1.146021842956543 }, { "epoch": 0.3522839628238086, "step": 3563, "train/sim_loss": 0.06640625 }, { "epoch": 0.3522839628238086, "step": 3563, "train/total_loss": 0.18100842833518982 }, { "entropy": 8.841144561767578, "epoch": 0.3523828356733241, "mean_token_accuracy": 0.729608952999115, "num_tokens": 19587356.0, "step": 3564, "train/ce_loss": 0.6340915560722351 }, { "epoch": 0.3523828356733241, "step": 3564, "train/sim_loss": 0.08203125 }, { "epoch": 0.3523828356733241, "step": 3564, "train/total_loss": 0.14544039964675903 }, { "entropy": 8.889899253845215, "epoch": 0.3524817085228396, "mean_token_accuracy": 0.8131042122840881, "num_tokens": 19592927.0, "step": 3565, "train/ce_loss": 0.6071549654006958 }, { "epoch": 0.3524817085228396, "step": 3565, "train/sim_loss": 0.07421875 }, { "epoch": 0.3524817085228396, "step": 3565, "train/total_loss": 0.13493424654006958 }, { "entropy": 9.198972702026367, "epoch": 0.35258058137235515, "mean_token_accuracy": 0.7852112650871277, "num_tokens": 19598331.0, "step": 3566, "train/ce_loss": 0.6926777958869934 }, { "epoch": 0.35258058137235515, "step": 3566, "train/sim_loss": 0.046875 }, { "epoch": 0.35258058137235515, "step": 3566, "train/total_loss": 0.11614277958869934 }, { "entropy": 8.85969066619873, "epoch": 0.3526794542218707, "mean_token_accuracy": 0.7527352571487427, "num_tokens": 19603858.0, "step": 3567, "train/ce_loss": 0.8991467952728271 }, { "epoch": 0.3526794542218707, "step": 3567, "train/sim_loss": 0.0546875 }, { "epoch": 0.3526794542218707, "step": 3567, "train/total_loss": 0.14460217952728271 }, { "entropy": 9.057412147521973, "epoch": 0.3527783270713862, "mean_token_accuracy": 0.7562723755836487, "num_tokens": 19609233.0, "step": 3568, "train/ce_loss": 0.3857838213443756 }, { "epoch": 0.3527783270713862, "step": 3568, "train/sim_loss": 0.0234375 }, { "epoch": 0.3527783270713862, "step": 3568, "train/total_loss": 0.06201588362455368 }, { "entropy": 8.899857521057129, "epoch": 0.3528771999209017, "mean_token_accuracy": 0.7986029982566833, "num_tokens": 19614673.0, "step": 3569, "train/ce_loss": 1.003015398979187 }, { "epoch": 0.3528771999209017, "step": 3569, "train/sim_loss": 0.0546875 }, { "epoch": 0.3528771999209017, "step": 3569, "train/total_loss": 0.15498903393745422 }, { "entropy": 8.519820213317871, "epoch": 0.35297607277041726, "mean_token_accuracy": 0.7638888955116272, "num_tokens": 19620304.0, "step": 3570, "train/ce_loss": 0.6046770811080933 }, { "epoch": 0.35297607277041726, "step": 3570, "train/sim_loss": 0.03125 }, { "epoch": 0.35297607277041726, "step": 3570, "train/total_loss": 0.09171770513057709 }, { "entropy": 9.013934135437012, "epoch": 0.35307494561993275, "mean_token_accuracy": 0.7375271320343018, "num_tokens": 19625818.0, "step": 3571, "train/ce_loss": 0.9548587203025818 }, { "epoch": 0.35307494561993275, "step": 3571, "train/sim_loss": 0.08984375 }, { "epoch": 0.35307494561993275, "step": 3571, "train/total_loss": 0.1853296160697937 }, { "entropy": 8.949447631835938, "epoch": 0.3531738184694483, "mean_token_accuracy": 0.7958179712295532, "num_tokens": 19631412.0, "step": 3572, "train/ce_loss": 0.7205714583396912 }, { "epoch": 0.3531738184694483, "step": 3572, "train/sim_loss": 0.0703125 }, { "epoch": 0.3531738184694483, "step": 3572, "train/total_loss": 0.14236965775489807 }, { "entropy": 8.011788368225098, "epoch": 0.3532726913189638, "mean_token_accuracy": 0.7139319181442261, "num_tokens": 19637554.0, "step": 3573, "train/ce_loss": 0.3577831983566284 }, { "epoch": 0.3532726913189638, "step": 3573, "train/sim_loss": 0.0546875 }, { "epoch": 0.3532726913189638, "step": 3573, "train/total_loss": 0.09046582132577896 }, { "entropy": 8.982519149780273, "epoch": 0.3533715641684793, "mean_token_accuracy": 0.7025527358055115, "num_tokens": 19643131.0, "step": 3574, "train/ce_loss": 1.0383579730987549 }, { "epoch": 0.3533715641684793, "step": 3574, "train/sim_loss": 0.14453125 }, { "epoch": 0.3533715641684793, "step": 3574, "train/total_loss": 0.248367041349411 }, { "entropy": 9.072978973388672, "epoch": 0.35347043701799485, "mean_token_accuracy": 0.7966101765632629, "num_tokens": 19648491.0, "step": 3575, "train/ce_loss": 0.9522057771682739 }, { "epoch": 0.35347043701799485, "step": 3575, "train/sim_loss": 0.06640625 }, { "epoch": 0.35347043701799485, "step": 3575, "train/total_loss": 0.16162683069705963 }, { "entropy": 9.166526794433594, "epoch": 0.3535693098675104, "mean_token_accuracy": 0.7586206793785095, "num_tokens": 19653782.0, "step": 3576, "train/ce_loss": 1.0766963958740234 }, { "epoch": 0.3535693098675104, "step": 3576, "train/sim_loss": 0.1015625 }, { "epoch": 0.3535693098675104, "step": 3576, "train/total_loss": 0.2092321515083313 }, { "entropy": 8.559854507446289, "epoch": 0.3536681827170259, "mean_token_accuracy": 0.793071985244751, "num_tokens": 19659488.0, "step": 3577, "train/ce_loss": 0.38514256477355957 }, { "epoch": 0.3536681827170259, "step": 3577, "train/sim_loss": 0.0546875 }, { "epoch": 0.3536681827170259, "step": 3577, "train/total_loss": 0.09320175647735596 }, { "entropy": 8.91602611541748, "epoch": 0.3537670555665414, "mean_token_accuracy": 0.6998928189277649, "num_tokens": 19665058.0, "step": 3578, "train/ce_loss": 0.8695381283760071 }, { "epoch": 0.3537670555665414, "step": 3578, "train/sim_loss": 0.046875 }, { "epoch": 0.3537670555665414, "step": 3578, "train/total_loss": 0.13382881879806519 }, { "entropy": 8.64703369140625, "epoch": 0.35386592841605696, "mean_token_accuracy": 0.7359667420387268, "num_tokens": 19670680.0, "step": 3579, "train/ce_loss": 0.703058123588562 }, { "epoch": 0.35386592841605696, "step": 3579, "train/sim_loss": 0.046875 }, { "epoch": 0.35386592841605696, "step": 3579, "train/total_loss": 0.11718081682920456 }, { "epoch": 0.35396480126557245, "grad_norm": 0.8482369184494019, "learning_rate": 9.117588883944025e-06, "loss": 0.1398, "step": 3580 }, { "entropy": 9.046793937683105, "epoch": 0.35396480126557245, "mean_token_accuracy": 0.7828054428100586, "num_tokens": 19676177.0, "step": 3580, "train/ce_loss": 0.35725295543670654 }, { "epoch": 0.35396480126557245, "step": 3580, "train/sim_loss": 0.0625 }, { "epoch": 0.35396480126557245, "step": 3580, "train/total_loss": 0.09822529554367065 }, { "entropy": 8.658795356750488, "epoch": 0.354063674115088, "mean_token_accuracy": 0.7803992629051208, "num_tokens": 19681868.0, "step": 3581, "train/ce_loss": 0.33733469247817993 }, { "epoch": 0.354063674115088, "step": 3581, "train/sim_loss": 0.0390625 }, { "epoch": 0.354063674115088, "step": 3581, "train/total_loss": 0.07279597222805023 }, { "entropy": 9.259183883666992, "epoch": 0.35416254696460353, "mean_token_accuracy": 0.738896369934082, "num_tokens": 19687191.0, "step": 3582, "train/ce_loss": 0.6417356133460999 }, { "epoch": 0.35416254696460353, "step": 3582, "train/sim_loss": 0.04296875 }, { "epoch": 0.35416254696460353, "step": 3582, "train/total_loss": 0.10714231431484222 }, { "entropy": 9.01165771484375, "epoch": 0.354261419814119, "mean_token_accuracy": 0.7369668483734131, "num_tokens": 19692613.0, "step": 3583, "train/ce_loss": 0.4299032986164093 }, { "epoch": 0.354261419814119, "step": 3583, "train/sim_loss": 0.01953125 }, { "epoch": 0.354261419814119, "step": 3583, "train/total_loss": 0.06252157688140869 }, { "entropy": 8.889436721801758, "epoch": 0.35436029266363456, "mean_token_accuracy": 0.7754459381103516, "num_tokens": 19698158.0, "step": 3584, "train/ce_loss": 0.4307650625705719 }, { "epoch": 0.35436029266363456, "step": 3584, "train/sim_loss": 0.0234375 }, { "epoch": 0.35436029266363456, "step": 3584, "train/total_loss": 0.06651400774717331 }, { "entropy": 8.739672660827637, "epoch": 0.3544591655131501, "mean_token_accuracy": 0.8183737993240356, "num_tokens": 19703786.0, "step": 3585, "train/ce_loss": 0.5170531272888184 }, { "epoch": 0.3544591655131501, "step": 3585, "train/sim_loss": 0.03125 }, { "epoch": 0.3544591655131501, "step": 3585, "train/total_loss": 0.08295531570911407 }, { "entropy": 8.633721351623535, "epoch": 0.35455803836266564, "mean_token_accuracy": 0.77704918384552, "num_tokens": 19709211.0, "step": 3586, "train/ce_loss": 0.8506359457969666 }, { "epoch": 0.35455803836266564, "step": 3586, "train/sim_loss": 0.06640625 }, { "epoch": 0.35455803836266564, "step": 3586, "train/total_loss": 0.1514698565006256 }, { "entropy": 8.812685012817383, "epoch": 0.3546569112121811, "mean_token_accuracy": 0.7340659499168396, "num_tokens": 19714842.0, "step": 3587, "train/ce_loss": 0.44505739212036133 }, { "epoch": 0.3546569112121811, "step": 3587, "train/sim_loss": 0.11328125 }, { "epoch": 0.3546569112121811, "step": 3587, "train/total_loss": 0.1577869951725006 }, { "entropy": 9.383683204650879, "epoch": 0.35475578406169667, "mean_token_accuracy": 0.7718383073806763, "num_tokens": 19720169.0, "step": 3588, "train/ce_loss": 0.7784637808799744 }, { "epoch": 0.35475578406169667, "step": 3588, "train/sim_loss": 0.07421875 }, { "epoch": 0.35475578406169667, "step": 3588, "train/total_loss": 0.15206512808799744 }, { "entropy": 9.190938949584961, "epoch": 0.3548546569112122, "mean_token_accuracy": 0.7383647561073303, "num_tokens": 19725582.0, "step": 3589, "train/ce_loss": 0.9963865876197815 }, { "epoch": 0.3548546569112122, "step": 3589, "train/sim_loss": 0.0546875 }, { "epoch": 0.3548546569112122, "step": 3589, "train/total_loss": 0.1543261706829071 }, { "entropy": 8.94316291809082, "epoch": 0.3549535297607277, "mean_token_accuracy": 0.7048611044883728, "num_tokens": 19731013.0, "step": 3590, "train/ce_loss": 1.106566071510315 }, { "epoch": 0.3549535297607277, "step": 3590, "train/sim_loss": 0.0390625 }, { "epoch": 0.3549535297607277, "step": 3590, "train/total_loss": 0.14971911907196045 }, { "entropy": 9.128667831420898, "epoch": 0.35505240261024323, "mean_token_accuracy": 0.7592371702194214, "num_tokens": 19736519.0, "step": 3591, "train/ce_loss": 0.7086053490638733 }, { "epoch": 0.35505240261024323, "step": 3591, "train/sim_loss": 0.03515625 }, { "epoch": 0.35505240261024323, "step": 3591, "train/total_loss": 0.10601678490638733 }, { "entropy": 9.4237060546875, "epoch": 0.3551512754597588, "mean_token_accuracy": 0.762390673160553, "num_tokens": 19741748.0, "step": 3592, "train/ce_loss": 0.7978636622428894 }, { "epoch": 0.3551512754597588, "step": 3592, "train/sim_loss": 0.05859375 }, { "epoch": 0.3551512754597588, "step": 3592, "train/total_loss": 0.13838011026382446 }, { "entropy": 8.97769546508789, "epoch": 0.35525014830927426, "mean_token_accuracy": 0.702732264995575, "num_tokens": 19747331.0, "step": 3593, "train/ce_loss": 0.6826847791671753 }, { "epoch": 0.35525014830927426, "step": 3593, "train/sim_loss": 0.12109375 }, { "epoch": 0.35525014830927426, "step": 3593, "train/total_loss": 0.18936222791671753 }, { "entropy": 8.777288436889648, "epoch": 0.3553490211587898, "mean_token_accuracy": 0.7296726703643799, "num_tokens": 19752874.0, "step": 3594, "train/ce_loss": 0.7897566556930542 }, { "epoch": 0.3553490211587898, "step": 3594, "train/sim_loss": 0.078125 }, { "epoch": 0.3553490211587898, "step": 3594, "train/total_loss": 0.15710067749023438 }, { "entropy": 8.85952091217041, "epoch": 0.35544789400830534, "mean_token_accuracy": 0.7038152813911438, "num_tokens": 19758470.0, "step": 3595, "train/ce_loss": 0.4982485771179199 }, { "epoch": 0.35544789400830534, "step": 3595, "train/sim_loss": 0.03515625 }, { "epoch": 0.35544789400830534, "step": 3595, "train/total_loss": 0.08498111367225647 }, { "entropy": 8.68649673461914, "epoch": 0.35554676685782083, "mean_token_accuracy": 0.6943005323410034, "num_tokens": 19764190.0, "step": 3596, "train/ce_loss": 0.6473181843757629 }, { "epoch": 0.35554676685782083, "step": 3596, "train/sim_loss": 0.0390625 }, { "epoch": 0.35554676685782083, "step": 3596, "train/total_loss": 0.10379432141780853 }, { "entropy": 9.330177307128906, "epoch": 0.35564563970733637, "mean_token_accuracy": 0.7313432693481445, "num_tokens": 19769555.0, "step": 3597, "train/ce_loss": 1.0591461658477783 }, { "epoch": 0.35564563970733637, "step": 3597, "train/sim_loss": 0.0625 }, { "epoch": 0.35564563970733637, "step": 3597, "train/total_loss": 0.1684146225452423 }, { "entropy": 8.593873977661133, "epoch": 0.3557445125568519, "mean_token_accuracy": 0.7280898690223694, "num_tokens": 19775070.0, "step": 3598, "train/ce_loss": 1.2873096466064453 }, { "epoch": 0.3557445125568519, "step": 3598, "train/sim_loss": 0.078125 }, { "epoch": 0.3557445125568519, "step": 3598, "train/total_loss": 0.20685596764087677 }, { "entropy": 8.880512237548828, "epoch": 0.3558433854063674, "mean_token_accuracy": 0.796407163143158, "num_tokens": 19780669.0, "step": 3599, "train/ce_loss": 0.6618034243583679 }, { "epoch": 0.3558433854063674, "step": 3599, "train/sim_loss": 0.0390625 }, { "epoch": 0.3558433854063674, "step": 3599, "train/total_loss": 0.10524284094572067 }, { "epoch": 0.35594225825588294, "grad_norm": 0.6375426650047302, "learning_rate": 9.112644019186077e-06, "loss": 0.1374, "step": 3600 }, { "entropy": 9.012710571289062, "epoch": 0.35594225825588294, "mean_token_accuracy": 0.8024149537086487, "num_tokens": 19786173.0, "step": 3600, "train/ce_loss": 0.47707700729370117 }, { "epoch": 0.35594225825588294, "step": 3600, "train/sim_loss": 0.0390625 }, { "epoch": 0.35594225825588294, "step": 3600, "train/total_loss": 0.0867702066898346 }, { "entropy": 8.969949722290039, "epoch": 0.3560411311053985, "mean_token_accuracy": 0.7922350764274597, "num_tokens": 19791757.0, "step": 3601, "train/ce_loss": 0.6900349259376526 }, { "epoch": 0.3560411311053985, "step": 3601, "train/sim_loss": 0.015625 }, { "epoch": 0.3560411311053985, "step": 3601, "train/total_loss": 0.08462849259376526 }, { "entropy": 8.976509094238281, "epoch": 0.35614000395491396, "mean_token_accuracy": 0.7235682606697083, "num_tokens": 19797271.0, "step": 3602, "train/ce_loss": 0.5867919325828552 }, { "epoch": 0.35614000395491396, "step": 3602, "train/sim_loss": 0.046875 }, { "epoch": 0.35614000395491396, "step": 3602, "train/total_loss": 0.10555419325828552 }, { "entropy": 8.791834831237793, "epoch": 0.3562388768044295, "mean_token_accuracy": 0.7035132646560669, "num_tokens": 19803038.0, "step": 3603, "train/ce_loss": 0.6046474575996399 }, { "epoch": 0.3562388768044295, "step": 3603, "train/sim_loss": 0.08203125 }, { "epoch": 0.3562388768044295, "step": 3603, "train/total_loss": 0.1424959897994995 }, { "entropy": 8.961322784423828, "epoch": 0.35633774965394505, "mean_token_accuracy": 0.7503030300140381, "num_tokens": 19808516.0, "step": 3604, "train/ce_loss": 0.7700955867767334 }, { "epoch": 0.35633774965394505, "step": 3604, "train/sim_loss": 0.0625 }, { "epoch": 0.35633774965394505, "step": 3604, "train/total_loss": 0.13950955867767334 }, { "entropy": 8.997053146362305, "epoch": 0.35643662250346053, "mean_token_accuracy": 0.7925840020179749, "num_tokens": 19813912.0, "step": 3605, "train/ce_loss": 0.4950610399246216 }, { "epoch": 0.35643662250346053, "step": 3605, "train/sim_loss": 0.0234375 }, { "epoch": 0.35643662250346053, "step": 3605, "train/total_loss": 0.07294360548257828 }, { "entropy": 8.359704971313477, "epoch": 0.3565354953529761, "mean_token_accuracy": 0.7508196830749512, "num_tokens": 19819717.0, "step": 3606, "train/ce_loss": 0.28353703022003174 }, { "epoch": 0.3565354953529761, "step": 3606, "train/sim_loss": 0.09375 }, { "epoch": 0.3565354953529761, "step": 3606, "train/total_loss": 0.12210370600223541 }, { "entropy": 9.151616096496582, "epoch": 0.3566343682024916, "mean_token_accuracy": 0.7659313678741455, "num_tokens": 19825177.0, "step": 3607, "train/ce_loss": 0.4120355248451233 }, { "epoch": 0.3566343682024916, "step": 3607, "train/sim_loss": 0.046875 }, { "epoch": 0.3566343682024916, "step": 3607, "train/total_loss": 0.0880785584449768 }, { "entropy": 9.046510696411133, "epoch": 0.3567332410520071, "mean_token_accuracy": 0.7249070405960083, "num_tokens": 19830649.0, "step": 3608, "train/ce_loss": 1.0359292030334473 }, { "epoch": 0.3567332410520071, "step": 3608, "train/sim_loss": 0.11328125 }, { "epoch": 0.3567332410520071, "step": 3608, "train/total_loss": 0.21687418222427368 }, { "entropy": 8.969316482543945, "epoch": 0.35683211390152264, "mean_token_accuracy": 0.729567289352417, "num_tokens": 19836098.0, "step": 3609, "train/ce_loss": 1.2144824266433716 }, { "epoch": 0.35683211390152264, "step": 3609, "train/sim_loss": 0.0703125 }, { "epoch": 0.35683211390152264, "step": 3609, "train/total_loss": 0.19176074862480164 }, { "entropy": 8.963048934936523, "epoch": 0.3569309867510382, "mean_token_accuracy": 0.7431865930557251, "num_tokens": 19841689.0, "step": 3610, "train/ce_loss": 0.8100427389144897 }, { "epoch": 0.3569309867510382, "step": 3610, "train/sim_loss": 0.11328125 }, { "epoch": 0.3569309867510382, "step": 3610, "train/total_loss": 0.1942855268716812 }, { "entropy": 8.87495231628418, "epoch": 0.35702985960055367, "mean_token_accuracy": 0.7289377450942993, "num_tokens": 19847122.0, "step": 3611, "train/ce_loss": 0.7933953404426575 }, { "epoch": 0.35702985960055367, "step": 3611, "train/sim_loss": 0.1015625 }, { "epoch": 0.35702985960055367, "step": 3611, "train/total_loss": 0.18090203404426575 }, { "entropy": 9.242395401000977, "epoch": 0.3571287324500692, "mean_token_accuracy": 0.7928388714790344, "num_tokens": 19852515.0, "step": 3612, "train/ce_loss": 0.5641875267028809 }, { "epoch": 0.3571287324500692, "step": 3612, "train/sim_loss": 0.0546875 }, { "epoch": 0.3571287324500692, "step": 3612, "train/total_loss": 0.1111062541604042 }, { "entropy": 9.038013458251953, "epoch": 0.35722760529958475, "mean_token_accuracy": 0.7677419185638428, "num_tokens": 19858063.0, "step": 3613, "train/ce_loss": 0.5378498435020447 }, { "epoch": 0.35722760529958475, "step": 3613, "train/sim_loss": 0.12890625 }, { "epoch": 0.35722760529958475, "step": 3613, "train/total_loss": 0.18269123136997223 }, { "entropy": 9.122387886047363, "epoch": 0.35732647814910024, "mean_token_accuracy": 0.688249409198761, "num_tokens": 19863476.0, "step": 3614, "train/ce_loss": 1.7329460382461548 }, { "epoch": 0.35732647814910024, "step": 3614, "train/sim_loss": 0.1171875 }, { "epoch": 0.35732647814910024, "step": 3614, "train/total_loss": 0.2904821038246155 }, { "entropy": 8.87392807006836, "epoch": 0.3574253509986158, "mean_token_accuracy": 0.7553418874740601, "num_tokens": 19869039.0, "step": 3615, "train/ce_loss": 1.007181167602539 }, { "epoch": 0.3574253509986158, "step": 3615, "train/sim_loss": 0.1328125 }, { "epoch": 0.3574253509986158, "step": 3615, "train/total_loss": 0.23353061079978943 }, { "entropy": 9.464313507080078, "epoch": 0.3575242238481313, "mean_token_accuracy": 0.7719298005104065, "num_tokens": 19874303.0, "step": 3616, "train/ce_loss": 0.4526442587375641 }, { "epoch": 0.3575242238481313, "step": 3616, "train/sim_loss": 0.078125 }, { "epoch": 0.3575242238481313, "step": 3616, "train/total_loss": 0.12338942289352417 }, { "entropy": 8.788362503051758, "epoch": 0.3576230966976468, "mean_token_accuracy": 0.7542628049850464, "num_tokens": 19879948.0, "step": 3617, "train/ce_loss": 0.24057941138744354 }, { "epoch": 0.3576230966976468, "step": 3617, "train/sim_loss": 0.0234375 }, { "epoch": 0.3576230966976468, "step": 3617, "train/total_loss": 0.047495439648628235 }, { "entropy": 8.940460205078125, "epoch": 0.35772196954716234, "mean_token_accuracy": 0.8125665783882141, "num_tokens": 19885545.0, "step": 3618, "train/ce_loss": 0.5468629002571106 }, { "epoch": 0.35772196954716234, "step": 3618, "train/sim_loss": 0.02734375 }, { "epoch": 0.35772196954716234, "step": 3618, "train/total_loss": 0.0820300430059433 }, { "entropy": 9.145745277404785, "epoch": 0.3578208423966779, "mean_token_accuracy": 0.7816349267959595, "num_tokens": 19891009.0, "step": 3619, "train/ce_loss": 0.6798914670944214 }, { "epoch": 0.3578208423966779, "step": 3619, "train/sim_loss": 0.046875 }, { "epoch": 0.3578208423966779, "step": 3619, "train/total_loss": 0.11486414819955826 }, { "epoch": 0.35791971524619337, "grad_norm": 0.5745992660522461, "learning_rate": 9.107699154428126e-06, "loss": 0.1359, "step": 3620 }, { "entropy": 9.0496244430542, "epoch": 0.35791971524619337, "mean_token_accuracy": 0.7608142495155334, "num_tokens": 19896437.0, "step": 3620, "train/ce_loss": 0.7780752182006836 }, { "epoch": 0.35791971524619337, "step": 3620, "train/sim_loss": 0.0546875 }, { "epoch": 0.35791971524619337, "step": 3620, "train/total_loss": 0.13249501585960388 }, { "entropy": 8.956603050231934, "epoch": 0.3580185880957089, "mean_token_accuracy": 0.6952381134033203, "num_tokens": 19902126.0, "step": 3621, "train/ce_loss": 0.6233916878700256 }, { "epoch": 0.3580185880957089, "step": 3621, "train/sim_loss": 0.1015625 }, { "epoch": 0.3580185880957089, "step": 3621, "train/total_loss": 0.1639016717672348 }, { "entropy": 9.094499588012695, "epoch": 0.35811746094522445, "mean_token_accuracy": 0.74301677942276, "num_tokens": 19907598.0, "step": 3622, "train/ce_loss": 0.9564700722694397 }, { "epoch": 0.35811746094522445, "step": 3622, "train/sim_loss": 0.0703125 }, { "epoch": 0.35811746094522445, "step": 3622, "train/total_loss": 0.16595950722694397 }, { "entropy": 8.991744995117188, "epoch": 0.35821633379473994, "mean_token_accuracy": 0.7101010084152222, "num_tokens": 19913265.0, "step": 3623, "train/ce_loss": 0.7366756796836853 }, { "epoch": 0.35821633379473994, "step": 3623, "train/sim_loss": 0.05859375 }, { "epoch": 0.35821633379473994, "step": 3623, "train/total_loss": 0.13226132094860077 }, { "entropy": 9.003870010375977, "epoch": 0.3583152066442555, "mean_token_accuracy": 0.7063882350921631, "num_tokens": 19918669.0, "step": 3624, "train/ce_loss": 0.6977971792221069 }, { "epoch": 0.3583152066442555, "step": 3624, "train/sim_loss": 0.07421875 }, { "epoch": 0.3583152066442555, "step": 3624, "train/total_loss": 0.14399847388267517 }, { "entropy": 8.623434066772461, "epoch": 0.358414079493771, "mean_token_accuracy": 0.7557603716850281, "num_tokens": 19924406.0, "step": 3625, "train/ce_loss": 1.2855256795883179 }, { "epoch": 0.358414079493771, "step": 3625, "train/sim_loss": 0.08984375 }, { "epoch": 0.358414079493771, "step": 3625, "train/total_loss": 0.21839632093906403 }, { "entropy": 9.095728874206543, "epoch": 0.3585129523432865, "mean_token_accuracy": 0.7603878378868103, "num_tokens": 19929752.0, "step": 3626, "train/ce_loss": 0.5364015698432922 }, { "epoch": 0.3585129523432865, "step": 3626, "train/sim_loss": 0.0625 }, { "epoch": 0.3585129523432865, "step": 3626, "train/total_loss": 0.11614015698432922 }, { "entropy": 8.807661056518555, "epoch": 0.35861182519280205, "mean_token_accuracy": 0.7211328744888306, "num_tokens": 19935310.0, "step": 3627, "train/ce_loss": 0.7192766070365906 }, { "epoch": 0.35861182519280205, "step": 3627, "train/sim_loss": 0.09375 }, { "epoch": 0.35861182519280205, "step": 3627, "train/total_loss": 0.16567766666412354 }, { "entropy": 8.864496231079102, "epoch": 0.3587106980423176, "mean_token_accuracy": 0.8305647969245911, "num_tokens": 19940856.0, "step": 3628, "train/ce_loss": 0.5975772738456726 }, { "epoch": 0.3587106980423176, "step": 3628, "train/sim_loss": 0.06640625 }, { "epoch": 0.3587106980423176, "step": 3628, "train/total_loss": 0.12616397440433502 }, { "entropy": 8.783416748046875, "epoch": 0.35880957089183313, "mean_token_accuracy": 0.7906976938247681, "num_tokens": 19946473.0, "step": 3629, "train/ce_loss": 0.5455293655395508 }, { "epoch": 0.35880957089183313, "step": 3629, "train/sim_loss": 0.0390625 }, { "epoch": 0.35880957089183313, "step": 3629, "train/total_loss": 0.09361544251441956 }, { "entropy": 9.014535903930664, "epoch": 0.3589084437413486, "mean_token_accuracy": 0.720441997051239, "num_tokens": 19951948.0, "step": 3630, "train/ce_loss": 0.8385223746299744 }, { "epoch": 0.3589084437413486, "step": 3630, "train/sim_loss": 0.0546875 }, { "epoch": 0.3589084437413486, "step": 3630, "train/total_loss": 0.13853973150253296 }, { "entropy": 8.887271881103516, "epoch": 0.35900731659086416, "mean_token_accuracy": 0.7448512315750122, "num_tokens": 19957362.0, "step": 3631, "train/ce_loss": 0.6038448214530945 }, { "epoch": 0.35900731659086416, "step": 3631, "train/sim_loss": 0.03515625 }, { "epoch": 0.35900731659086416, "step": 3631, "train/total_loss": 0.09554073214530945 }, { "entropy": 8.95705795288086, "epoch": 0.3591061894403797, "mean_token_accuracy": 0.7461873888969421, "num_tokens": 19962933.0, "step": 3632, "train/ce_loss": 1.1903613805770874 }, { "epoch": 0.3591061894403797, "step": 3632, "train/sim_loss": 0.04296875 }, { "epoch": 0.3591061894403797, "step": 3632, "train/total_loss": 0.16200488805770874 }, { "entropy": 8.801412582397461, "epoch": 0.3592050622898952, "mean_token_accuracy": 0.7269906997680664, "num_tokens": 19968556.0, "step": 3633, "train/ce_loss": 1.1040678024291992 }, { "epoch": 0.3592050622898952, "step": 3633, "train/sim_loss": 0.11328125 }, { "epoch": 0.3592050622898952, "step": 3633, "train/total_loss": 0.2236880362033844 }, { "entropy": 9.040935516357422, "epoch": 0.3593039351394107, "mean_token_accuracy": 0.7457420825958252, "num_tokens": 19973920.0, "step": 3634, "train/ce_loss": 0.9278399348258972 }, { "epoch": 0.3593039351394107, "step": 3634, "train/sim_loss": 0.06640625 }, { "epoch": 0.3593039351394107, "step": 3634, "train/total_loss": 0.15919023752212524 }, { "entropy": 8.697087287902832, "epoch": 0.35940280798892627, "mean_token_accuracy": 0.6895459294319153, "num_tokens": 19979500.0, "step": 3635, "train/ce_loss": 0.7621396780014038 }, { "epoch": 0.35940280798892627, "step": 3635, "train/sim_loss": 0.0390625 }, { "epoch": 0.35940280798892627, "step": 3635, "train/total_loss": 0.11527647078037262 }, { "entropy": 9.104219436645508, "epoch": 0.35950168083844175, "mean_token_accuracy": 0.7801324725151062, "num_tokens": 19984894.0, "step": 3636, "train/ce_loss": 0.5207947492599487 }, { "epoch": 0.35950168083844175, "step": 3636, "train/sim_loss": 0.05859375 }, { "epoch": 0.35950168083844175, "step": 3636, "train/total_loss": 0.11067322641611099 }, { "entropy": 9.091957092285156, "epoch": 0.3596005536879573, "mean_token_accuracy": 0.7135802507400513, "num_tokens": 19990300.0, "step": 3637, "train/ce_loss": 0.4836645722389221 }, { "epoch": 0.3596005536879573, "step": 3637, "train/sim_loss": 0.02734375 }, { "epoch": 0.3596005536879573, "step": 3637, "train/total_loss": 0.07571020722389221 }, { "entropy": 9.267552375793457, "epoch": 0.35969942653747283, "mean_token_accuracy": 0.7647058963775635, "num_tokens": 19995576.0, "step": 3638, "train/ce_loss": 0.6979020833969116 }, { "epoch": 0.35969942653747283, "step": 3638, "train/sim_loss": 0.05078125 }, { "epoch": 0.35969942653747283, "step": 3638, "train/total_loss": 0.12057145684957504 }, { "entropy": 8.912626266479492, "epoch": 0.3597982993869883, "mean_token_accuracy": 0.7857142686843872, "num_tokens": 20000991.0, "step": 3639, "train/ce_loss": 0.5679906010627747 }, { "epoch": 0.3597982993869883, "step": 3639, "train/sim_loss": 0.05078125 }, { "epoch": 0.3597982993869883, "step": 3639, "train/total_loss": 0.10758031159639359 }, { "epoch": 0.35989717223650386, "grad_norm": 0.6485537886619568, "learning_rate": 9.102754289670178e-06, "loss": 0.143, "step": 3640 }, { "entropy": 9.184410095214844, "epoch": 0.35989717223650386, "mean_token_accuracy": 0.7523923516273499, "num_tokens": 20006322.0, "step": 3640, "train/ce_loss": 0.8359951972961426 }, { "epoch": 0.35989717223650386, "step": 3640, "train/sim_loss": 0.0546875 }, { "epoch": 0.35989717223650386, "step": 3640, "train/total_loss": 0.1382870227098465 }, { "entropy": 9.047425270080566, "epoch": 0.3599960450860194, "mean_token_accuracy": 0.7497048377990723, "num_tokens": 20011771.0, "step": 3641, "train/ce_loss": 0.4637411832809448 }, { "epoch": 0.3599960450860194, "step": 3641, "train/sim_loss": 0.0625 }, { "epoch": 0.3599960450860194, "step": 3641, "train/total_loss": 0.1088741198182106 }, { "entropy": 8.895525932312012, "epoch": 0.3600949179355349, "mean_token_accuracy": 0.7599118947982788, "num_tokens": 20017174.0, "step": 3642, "train/ce_loss": 0.7936043739318848 }, { "epoch": 0.3600949179355349, "step": 3642, "train/sim_loss": 0.01953125 }, { "epoch": 0.3600949179355349, "step": 3642, "train/total_loss": 0.09889169037342072 }, { "entropy": 9.018447875976562, "epoch": 0.3601937907850504, "mean_token_accuracy": 0.7452380657196045, "num_tokens": 20022675.0, "step": 3643, "train/ce_loss": 1.1563347578048706 }, { "epoch": 0.3601937907850504, "step": 3643, "train/sim_loss": 0.08984375 }, { "epoch": 0.3601937907850504, "step": 3643, "train/total_loss": 0.20547723770141602 }, { "entropy": 8.517647743225098, "epoch": 0.36029266363456597, "mean_token_accuracy": 0.720233142375946, "num_tokens": 20028502.0, "step": 3644, "train/ce_loss": 1.6341502666473389 }, { "epoch": 0.36029266363456597, "step": 3644, "train/sim_loss": 0.0546875 }, { "epoch": 0.36029266363456597, "step": 3644, "train/total_loss": 0.21810252964496613 }, { "entropy": 9.207710266113281, "epoch": 0.36039153648408145, "mean_token_accuracy": 0.6927152276039124, "num_tokens": 20033917.0, "step": 3645, "train/ce_loss": 0.4507928788661957 }, { "epoch": 0.36039153648408145, "step": 3645, "train/sim_loss": 0.0703125 }, { "epoch": 0.36039153648408145, "step": 3645, "train/total_loss": 0.1153917908668518 }, { "entropy": 9.065241813659668, "epoch": 0.360490409333597, "mean_token_accuracy": 0.7557355165481567, "num_tokens": 20039319.0, "step": 3646, "train/ce_loss": 1.0824849605560303 }, { "epoch": 0.360490409333597, "step": 3646, "train/sim_loss": 0.046875 }, { "epoch": 0.360490409333597, "step": 3646, "train/total_loss": 0.1551235020160675 }, { "entropy": 9.019176483154297, "epoch": 0.36058928218311254, "mean_token_accuracy": 0.7108141183853149, "num_tokens": 20044730.0, "step": 3647, "train/ce_loss": 1.0326341390609741 }, { "epoch": 0.36058928218311254, "step": 3647, "train/sim_loss": 0.0546875 }, { "epoch": 0.36058928218311254, "step": 3647, "train/total_loss": 0.15795090794563293 }, { "entropy": 8.653450965881348, "epoch": 0.360688155032628, "mean_token_accuracy": 0.7447817921638489, "num_tokens": 20050372.0, "step": 3648, "train/ce_loss": 0.7536072731018066 }, { "epoch": 0.360688155032628, "step": 3648, "train/sim_loss": 0.05859375 }, { "epoch": 0.360688155032628, "step": 3648, "train/total_loss": 0.1339544802904129 }, { "entropy": 8.90721607208252, "epoch": 0.36078702788214356, "mean_token_accuracy": 0.7713950872421265, "num_tokens": 20055770.0, "step": 3649, "train/ce_loss": 0.6003490686416626 }, { "epoch": 0.36078702788214356, "step": 3649, "train/sim_loss": 0.046875 }, { "epoch": 0.36078702788214356, "step": 3649, "train/total_loss": 0.10690990835428238 }, { "entropy": 9.075336456298828, "epoch": 0.3608859007316591, "mean_token_accuracy": 0.7583001255989075, "num_tokens": 20061161.0, "step": 3650, "train/ce_loss": 0.8070318102836609 }, { "epoch": 0.3608859007316591, "step": 3650, "train/sim_loss": 0.03515625 }, { "epoch": 0.3608859007316591, "step": 3650, "train/total_loss": 0.11585943400859833 }, { "entropy": 8.948091506958008, "epoch": 0.3609847735811746, "mean_token_accuracy": 0.7587719559669495, "num_tokens": 20066625.0, "step": 3651, "train/ce_loss": 0.8232377767562866 }, { "epoch": 0.3609847735811746, "step": 3651, "train/sim_loss": 0.07421875 }, { "epoch": 0.3609847735811746, "step": 3651, "train/total_loss": 0.15654253959655762 }, { "entropy": 8.734739303588867, "epoch": 0.36108364643069013, "mean_token_accuracy": 0.7634285688400269, "num_tokens": 20072163.0, "step": 3652, "train/ce_loss": 0.6687564253807068 }, { "epoch": 0.36108364643069013, "step": 3652, "train/sim_loss": 0.03125 }, { "epoch": 0.36108364643069013, "step": 3652, "train/total_loss": 0.0981256440281868 }, { "entropy": 9.289871215820312, "epoch": 0.36118251928020567, "mean_token_accuracy": 0.7477360963821411, "num_tokens": 20077517.0, "step": 3653, "train/ce_loss": 0.8850408792495728 }, { "epoch": 0.36118251928020567, "step": 3653, "train/sim_loss": 0.078125 }, { "epoch": 0.36118251928020567, "step": 3653, "train/total_loss": 0.16662909090518951 }, { "entropy": 9.25664234161377, "epoch": 0.36128139212972116, "mean_token_accuracy": 0.7197802066802979, "num_tokens": 20082883.0, "step": 3654, "train/ce_loss": 0.7877568006515503 }, { "epoch": 0.36128139212972116, "step": 3654, "train/sim_loss": 0.06640625 }, { "epoch": 0.36128139212972116, "step": 3654, "train/total_loss": 0.14518192410469055 }, { "entropy": 8.994457244873047, "epoch": 0.3613802649792367, "mean_token_accuracy": 0.7481203079223633, "num_tokens": 20088303.0, "step": 3655, "train/ce_loss": 1.197709083557129 }, { "epoch": 0.3613802649792367, "step": 3655, "train/sim_loss": 0.07421875 }, { "epoch": 0.3613802649792367, "step": 3655, "train/total_loss": 0.19398966431617737 }, { "entropy": 8.85307502746582, "epoch": 0.36147913782875224, "mean_token_accuracy": 0.7788671255111694, "num_tokens": 20093897.0, "step": 3656, "train/ce_loss": 0.7613956332206726 }, { "epoch": 0.36147913782875224, "step": 3656, "train/sim_loss": 0.06640625 }, { "epoch": 0.36147913782875224, "step": 3656, "train/total_loss": 0.14254581928253174 }, { "entropy": 8.922426223754883, "epoch": 0.3615780106782677, "mean_token_accuracy": 0.7436517477035522, "num_tokens": 20099335.0, "step": 3657, "train/ce_loss": 0.6588636636734009 }, { "epoch": 0.3615780106782677, "step": 3657, "train/sim_loss": 0.046875 }, { "epoch": 0.3615780106782677, "step": 3657, "train/total_loss": 0.11276137083768845 }, { "entropy": 9.375631332397461, "epoch": 0.36167688352778327, "mean_token_accuracy": 0.7736156582832336, "num_tokens": 20104539.0, "step": 3658, "train/ce_loss": 0.8283042311668396 }, { "epoch": 0.36167688352778327, "step": 3658, "train/sim_loss": 0.109375 }, { "epoch": 0.36167688352778327, "step": 3658, "train/total_loss": 0.19220542907714844 }, { "entropy": 8.873298645019531, "epoch": 0.3617757563772988, "mean_token_accuracy": 0.7956989407539368, "num_tokens": 20110067.0, "step": 3659, "train/ce_loss": 0.755554735660553 }, { "epoch": 0.3617757563772988, "step": 3659, "train/sim_loss": 0.02734375 }, { "epoch": 0.3617757563772988, "step": 3659, "train/total_loss": 0.1028992235660553 }, { "epoch": 0.3618746292268143, "grad_norm": 0.6949535608291626, "learning_rate": 9.097809424912229e-06, "loss": 0.1416, "step": 3660 }, { "entropy": 8.584820747375488, "epoch": 0.3618746292268143, "mean_token_accuracy": 0.7216147780418396, "num_tokens": 20115868.0, "step": 3660, "train/ce_loss": 0.5722332000732422 }, { "epoch": 0.3618746292268143, "step": 3660, "train/sim_loss": 0.0859375 }, { "epoch": 0.3618746292268143, "step": 3660, "train/total_loss": 0.14316082000732422 }, { "entropy": 9.017902374267578, "epoch": 0.36197350207632983, "mean_token_accuracy": 0.781333327293396, "num_tokens": 20121276.0, "step": 3661, "train/ce_loss": 0.6142660975456238 }, { "epoch": 0.36197350207632983, "step": 3661, "train/sim_loss": 0.08203125 }, { "epoch": 0.36197350207632983, "step": 3661, "train/total_loss": 0.14345785975456238 }, { "entropy": 8.88705825805664, "epoch": 0.3620723749258454, "mean_token_accuracy": 0.7767441868782043, "num_tokens": 20126779.0, "step": 3662, "train/ce_loss": 0.5184643864631653 }, { "epoch": 0.3620723749258454, "step": 3662, "train/sim_loss": 0.02734375 }, { "epoch": 0.3620723749258454, "step": 3662, "train/total_loss": 0.079190194606781 }, { "entropy": 8.670537948608398, "epoch": 0.36217124777536086, "mean_token_accuracy": 0.7043558955192566, "num_tokens": 20132438.0, "step": 3663, "train/ce_loss": 1.9623322486877441 }, { "epoch": 0.36217124777536086, "step": 3663, "train/sim_loss": 0.0625 }, { "epoch": 0.36217124777536086, "step": 3663, "train/total_loss": 0.25873321294784546 }, { "entropy": 9.085588455200195, "epoch": 0.3622701206248764, "mean_token_accuracy": 0.7108843326568604, "num_tokens": 20137968.0, "step": 3664, "train/ce_loss": 0.9422558546066284 }, { "epoch": 0.3622701206248764, "step": 3664, "train/sim_loss": 0.0703125 }, { "epoch": 0.3622701206248764, "step": 3664, "train/total_loss": 0.16453808546066284 }, { "entropy": 9.017799377441406, "epoch": 0.36236899347439194, "mean_token_accuracy": 0.7361751198768616, "num_tokens": 20143409.0, "step": 3665, "train/ce_loss": 1.4660276174545288 }, { "epoch": 0.36236899347439194, "step": 3665, "train/sim_loss": 0.05859375 }, { "epoch": 0.36236899347439194, "step": 3665, "train/total_loss": 0.20519651472568512 }, { "entropy": 8.707371711730957, "epoch": 0.36246786632390743, "mean_token_accuracy": 0.7537118196487427, "num_tokens": 20149082.0, "step": 3666, "train/ce_loss": 1.1021684408187866 }, { "epoch": 0.36246786632390743, "step": 3666, "train/sim_loss": 0.07421875 }, { "epoch": 0.36246786632390743, "step": 3666, "train/total_loss": 0.18443560600280762 }, { "entropy": 8.454755783081055, "epoch": 0.36256673917342297, "mean_token_accuracy": 0.779082179069519, "num_tokens": 20154732.0, "step": 3667, "train/ce_loss": 0.8464989066123962 }, { "epoch": 0.36256673917342297, "step": 3667, "train/sim_loss": 0.0703125 }, { "epoch": 0.36256673917342297, "step": 3667, "train/total_loss": 0.15496239066123962 }, { "entropy": 8.906429290771484, "epoch": 0.3626656120229385, "mean_token_accuracy": 0.6938534379005432, "num_tokens": 20160151.0, "step": 3668, "train/ce_loss": 0.5735172629356384 }, { "epoch": 0.3626656120229385, "step": 3668, "train/sim_loss": 0.05859375 }, { "epoch": 0.3626656120229385, "step": 3668, "train/total_loss": 0.1159454733133316 }, { "entropy": 8.77876091003418, "epoch": 0.36276448487245405, "mean_token_accuracy": 0.7639751434326172, "num_tokens": 20165751.0, "step": 3669, "train/ce_loss": 0.6163437366485596 }, { "epoch": 0.36276448487245405, "step": 3669, "train/sim_loss": 0.015625 }, { "epoch": 0.36276448487245405, "step": 3669, "train/total_loss": 0.0772593766450882 }, { "entropy": 8.926219940185547, "epoch": 0.36286335772196954, "mean_token_accuracy": 0.7865044474601746, "num_tokens": 20171266.0, "step": 3670, "train/ce_loss": 0.646302342414856 }, { "epoch": 0.36286335772196954, "step": 3670, "train/sim_loss": 0.04296875 }, { "epoch": 0.36286335772196954, "step": 3670, "train/total_loss": 0.10759898275136948 }, { "entropy": 9.255290031433105, "epoch": 0.3629622305714851, "mean_token_accuracy": 0.7478474974632263, "num_tokens": 20176705.0, "step": 3671, "train/ce_loss": 1.1192153692245483 }, { "epoch": 0.3629622305714851, "step": 3671, "train/sim_loss": 0.06640625 }, { "epoch": 0.3629622305714851, "step": 3671, "train/total_loss": 0.1783277988433838 }, { "entropy": 9.062845230102539, "epoch": 0.3630611034210006, "mean_token_accuracy": 0.7856273055076599, "num_tokens": 20182164.0, "step": 3672, "train/ce_loss": 0.6006273627281189 }, { "epoch": 0.3630611034210006, "step": 3672, "train/sim_loss": 0.078125 }, { "epoch": 0.3630611034210006, "step": 3672, "train/total_loss": 0.1381877362728119 }, { "entropy": 9.104019165039062, "epoch": 0.3631599762705161, "mean_token_accuracy": 0.7259158492088318, "num_tokens": 20187543.0, "step": 3673, "train/ce_loss": 1.0967262983322144 }, { "epoch": 0.3631599762705161, "step": 3673, "train/sim_loss": 0.125 }, { "epoch": 0.3631599762705161, "step": 3673, "train/total_loss": 0.2346726357936859 }, { "entropy": 9.133010864257812, "epoch": 0.36325884912003165, "mean_token_accuracy": 0.6784741282463074, "num_tokens": 20192838.0, "step": 3674, "train/ce_loss": 1.022828459739685 }, { "epoch": 0.36325884912003165, "step": 3674, "train/sim_loss": 0.0703125 }, { "epoch": 0.36325884912003165, "step": 3674, "train/total_loss": 0.17259535193443298 }, { "entropy": 8.96641731262207, "epoch": 0.3633577219695472, "mean_token_accuracy": 0.7253270149230957, "num_tokens": 20198303.0, "step": 3675, "train/ce_loss": 0.84723961353302 }, { "epoch": 0.3633577219695472, "step": 3675, "train/sim_loss": 0.07421875 }, { "epoch": 0.3633577219695472, "step": 3675, "train/total_loss": 0.15894271433353424 }, { "entropy": 9.372337341308594, "epoch": 0.3634565948190627, "mean_token_accuracy": 0.6855172514915466, "num_tokens": 20203630.0, "step": 3676, "train/ce_loss": 0.9439517259597778 }, { "epoch": 0.3634565948190627, "step": 3676, "train/sim_loss": 0.0859375 }, { "epoch": 0.3634565948190627, "step": 3676, "train/total_loss": 0.18033267557621002 }, { "entropy": 8.785751342773438, "epoch": 0.3635554676685782, "mean_token_accuracy": 0.7677778005599976, "num_tokens": 20209148.0, "step": 3677, "train/ce_loss": 1.00545072555542 }, { "epoch": 0.3635554676685782, "step": 3677, "train/sim_loss": 0.07421875 }, { "epoch": 0.3635554676685782, "step": 3677, "train/total_loss": 0.17476382851600647 }, { "entropy": 8.81253719329834, "epoch": 0.36365434051809375, "mean_token_accuracy": 0.7855072617530823, "num_tokens": 20214790.0, "step": 3678, "train/ce_loss": 0.9052372574806213 }, { "epoch": 0.36365434051809375, "step": 3678, "train/sim_loss": 0.0859375 }, { "epoch": 0.36365434051809375, "step": 3678, "train/total_loss": 0.17646121978759766 }, { "entropy": 9.130454063415527, "epoch": 0.36375321336760924, "mean_token_accuracy": 0.7436241507530212, "num_tokens": 20220189.0, "step": 3679, "train/ce_loss": 0.46336522698402405 }, { "epoch": 0.36375321336760924, "step": 3679, "train/sim_loss": 0.015625 }, { "epoch": 0.36375321336760924, "step": 3679, "train/total_loss": 0.061961524188518524 }, { "epoch": 0.3638520862171248, "grad_norm": 0.7456504106521606, "learning_rate": 9.092864560154281e-06, "loss": 0.1414, "step": 3680 }, { "entropy": 8.810253143310547, "epoch": 0.3638520862171248, "mean_token_accuracy": 0.7327935099601746, "num_tokens": 20226000.0, "step": 3680, "train/ce_loss": 0.37579602003097534 }, { "epoch": 0.3638520862171248, "step": 3680, "train/sim_loss": 0.06640625 }, { "epoch": 0.3638520862171248, "step": 3680, "train/total_loss": 0.10398585349321365 }, { "entropy": 8.914579391479492, "epoch": 0.3639509590666403, "mean_token_accuracy": 0.7494252920150757, "num_tokens": 20231419.0, "step": 3681, "train/ce_loss": 0.9510838389396667 }, { "epoch": 0.3639509590666403, "step": 3681, "train/sim_loss": 0.0390625 }, { "epoch": 0.3639509590666403, "step": 3681, "train/total_loss": 0.13417088985443115 }, { "entropy": 8.785409927368164, "epoch": 0.3640498319161558, "mean_token_accuracy": 0.774319052696228, "num_tokens": 20237037.0, "step": 3682, "train/ce_loss": 0.8004626631736755 }, { "epoch": 0.3640498319161558, "step": 3682, "train/sim_loss": 0.02734375 }, { "epoch": 0.3640498319161558, "step": 3682, "train/total_loss": 0.10739001631736755 }, { "entropy": 9.130671501159668, "epoch": 0.36414870476567135, "mean_token_accuracy": 0.7563587427139282, "num_tokens": 20242430.0, "step": 3683, "train/ce_loss": 0.8666002154350281 }, { "epoch": 0.36414870476567135, "step": 3683, "train/sim_loss": 0.078125 }, { "epoch": 0.36414870476567135, "step": 3683, "train/total_loss": 0.16478502750396729 }, { "entropy": 8.789774894714355, "epoch": 0.3642475776151869, "mean_token_accuracy": 0.7694038152694702, "num_tokens": 20247990.0, "step": 3684, "train/ce_loss": 0.8095942139625549 }, { "epoch": 0.3642475776151869, "step": 3684, "train/sim_loss": 0.0703125 }, { "epoch": 0.3642475776151869, "step": 3684, "train/total_loss": 0.15127192437648773 }, { "entropy": 8.602739334106445, "epoch": 0.3643464504647024, "mean_token_accuracy": 0.7052529454231262, "num_tokens": 20253559.0, "step": 3685, "train/ce_loss": 0.46135780215263367 }, { "epoch": 0.3643464504647024, "step": 3685, "train/sim_loss": 0.05859375 }, { "epoch": 0.3643464504647024, "step": 3685, "train/total_loss": 0.1047295331954956 }, { "entropy": 9.29705810546875, "epoch": 0.3644453233142179, "mean_token_accuracy": 0.7771739363670349, "num_tokens": 20258820.0, "step": 3686, "train/ce_loss": 0.8265032768249512 }, { "epoch": 0.3644453233142179, "step": 3686, "train/sim_loss": 0.11328125 }, { "epoch": 0.3644453233142179, "step": 3686, "train/total_loss": 0.1959315836429596 }, { "entropy": 8.816841125488281, "epoch": 0.36454419616373346, "mean_token_accuracy": 0.7760158777236938, "num_tokens": 20264476.0, "step": 3687, "train/ce_loss": 0.6165051460266113 }, { "epoch": 0.36454419616373346, "step": 3687, "train/sim_loss": 0.0390625 }, { "epoch": 0.36454419616373346, "step": 3687, "train/total_loss": 0.10071301460266113 }, { "entropy": 8.707780838012695, "epoch": 0.36464306901324894, "mean_token_accuracy": 0.8098859190940857, "num_tokens": 20270203.0, "step": 3688, "train/ce_loss": 0.5418565273284912 }, { "epoch": 0.36464306901324894, "step": 3688, "train/sim_loss": 0.06640625 }, { "epoch": 0.36464306901324894, "step": 3688, "train/total_loss": 0.1205919086933136 }, { "entropy": 8.723261833190918, "epoch": 0.3647419418627645, "mean_token_accuracy": 0.7393665313720703, "num_tokens": 20275865.0, "step": 3689, "train/ce_loss": 0.6183147430419922 }, { "epoch": 0.3647419418627645, "step": 3689, "train/sim_loss": 0.140625 }, { "epoch": 0.3647419418627645, "step": 3689, "train/total_loss": 0.20245647430419922 }, { "entropy": 9.215544700622559, "epoch": 0.36484081471228, "mean_token_accuracy": 0.7154861688613892, "num_tokens": 20281348.0, "step": 3690, "train/ce_loss": 0.624721884727478 }, { "epoch": 0.36484081471228, "step": 3690, "train/sim_loss": 0.0703125 }, { "epoch": 0.36484081471228, "step": 3690, "train/total_loss": 0.13278469443321228 }, { "entropy": 8.916409492492676, "epoch": 0.3649396875617955, "mean_token_accuracy": 0.7887029051780701, "num_tokens": 20286971.0, "step": 3691, "train/ce_loss": 0.9020763635635376 }, { "epoch": 0.3649396875617955, "step": 3691, "train/sim_loss": 0.13671875 }, { "epoch": 0.3649396875617955, "step": 3691, "train/total_loss": 0.22692638635635376 }, { "entropy": 8.893112182617188, "epoch": 0.36503856041131105, "mean_token_accuracy": 0.7993079423904419, "num_tokens": 20292446.0, "step": 3692, "train/ce_loss": 0.6157128214836121 }, { "epoch": 0.36503856041131105, "step": 3692, "train/sim_loss": 0.0390625 }, { "epoch": 0.36503856041131105, "step": 3692, "train/total_loss": 0.10063378512859344 }, { "entropy": 8.612115859985352, "epoch": 0.3651374332608266, "mean_token_accuracy": 0.733668327331543, "num_tokens": 20298303.0, "step": 3693, "train/ce_loss": 0.3946406841278076 }, { "epoch": 0.3651374332608266, "step": 3693, "train/sim_loss": 0.0234375 }, { "epoch": 0.3651374332608266, "step": 3693, "train/total_loss": 0.062901571393013 }, { "entropy": 8.924064636230469, "epoch": 0.3652363061103421, "mean_token_accuracy": 0.7248182892799377, "num_tokens": 20303852.0, "step": 3694, "train/ce_loss": 1.382340908050537 }, { "epoch": 0.3652363061103421, "step": 3694, "train/sim_loss": 0.0546875 }, { "epoch": 0.3652363061103421, "step": 3694, "train/total_loss": 0.19292159378528595 }, { "entropy": 8.721240997314453, "epoch": 0.3653351789598576, "mean_token_accuracy": 0.744027316570282, "num_tokens": 20309360.0, "step": 3695, "train/ce_loss": 0.8917349576950073 }, { "epoch": 0.3653351789598576, "step": 3695, "train/sim_loss": 0.03515625 }, { "epoch": 0.3653351789598576, "step": 3695, "train/total_loss": 0.12432974576950073 }, { "entropy": 8.77725601196289, "epoch": 0.36543405180937316, "mean_token_accuracy": 0.7160493731498718, "num_tokens": 20315068.0, "step": 3696, "train/ce_loss": 0.5007692575454712 }, { "epoch": 0.36543405180937316, "step": 3696, "train/sim_loss": 0.05859375 }, { "epoch": 0.36543405180937316, "step": 3696, "train/total_loss": 0.1086706817150116 }, { "entropy": 9.140199661254883, "epoch": 0.36553292465888865, "mean_token_accuracy": 0.7730138897895813, "num_tokens": 20320417.0, "step": 3697, "train/ce_loss": 0.3234008252620697 }, { "epoch": 0.36553292465888865, "step": 3697, "train/sim_loss": 0.1015625 }, { "epoch": 0.36553292465888865, "step": 3697, "train/total_loss": 0.13390257954597473 }, { "entropy": 8.97885513305664, "epoch": 0.3656317975084042, "mean_token_accuracy": 0.729903519153595, "num_tokens": 20325880.0, "step": 3698, "train/ce_loss": 0.7937451601028442 }, { "epoch": 0.3656317975084042, "step": 3698, "train/sim_loss": 0.05859375 }, { "epoch": 0.3656317975084042, "step": 3698, "train/total_loss": 0.1379682719707489 }, { "entropy": 8.839160919189453, "epoch": 0.36573067035791973, "mean_token_accuracy": 0.7684085369110107, "num_tokens": 20331405.0, "step": 3699, "train/ce_loss": 0.31463223695755005 }, { "epoch": 0.36573067035791973, "step": 3699, "train/sim_loss": 0.0234375 }, { "epoch": 0.36573067035791973, "step": 3699, "train/total_loss": 0.054900724440813065 }, { "epoch": 0.3658295432074352, "grad_norm": 0.7615626454353333, "learning_rate": 9.087919695396332e-06, "loss": 0.1382, "step": 3700 }, { "entropy": 8.461156845092773, "epoch": 0.3658295432074352, "mean_token_accuracy": 0.6966205835342407, "num_tokens": 20337309.0, "step": 3700, "train/ce_loss": 0.5148777961730957 }, { "epoch": 0.3658295432074352, "step": 3700, "train/sim_loss": 0.0625 }, { "epoch": 0.3658295432074352, "step": 3700, "train/total_loss": 0.11398778110742569 }, { "entropy": 9.197784423828125, "epoch": 0.36592841605695076, "mean_token_accuracy": 0.782608687877655, "num_tokens": 20342643.0, "step": 3701, "train/ce_loss": 0.8089616894721985 }, { "epoch": 0.36592841605695076, "step": 3701, "train/sim_loss": 0.03515625 }, { "epoch": 0.36592841605695076, "step": 3701, "train/total_loss": 0.11605241894721985 }, { "entropy": 9.12997055053711, "epoch": 0.3660272889064663, "mean_token_accuracy": 0.7238219976425171, "num_tokens": 20347976.0, "step": 3702, "train/ce_loss": 0.9946410655975342 }, { "epoch": 0.3660272889064663, "step": 3702, "train/sim_loss": 0.05859375 }, { "epoch": 0.3660272889064663, "step": 3702, "train/total_loss": 0.15805786848068237 }, { "entropy": 8.866531372070312, "epoch": 0.3661261617559818, "mean_token_accuracy": 0.7429667711257935, "num_tokens": 20353333.0, "step": 3703, "train/ce_loss": 0.4512418210506439 }, { "epoch": 0.3661261617559818, "step": 3703, "train/sim_loss": 0.0546875 }, { "epoch": 0.3661261617559818, "step": 3703, "train/total_loss": 0.09981168806552887 }, { "entropy": 9.163649559020996, "epoch": 0.3662250346054973, "mean_token_accuracy": 0.8016529083251953, "num_tokens": 20358616.0, "step": 3704, "train/ce_loss": 0.6391116380691528 }, { "epoch": 0.3662250346054973, "step": 3704, "train/sim_loss": 0.02734375 }, { "epoch": 0.3662250346054973, "step": 3704, "train/total_loss": 0.09125491231679916 }, { "entropy": 9.146544456481934, "epoch": 0.36632390745501286, "mean_token_accuracy": 0.7415048480033875, "num_tokens": 20364239.0, "step": 3705, "train/ce_loss": 0.5862305760383606 }, { "epoch": 0.36632390745501286, "step": 3705, "train/sim_loss": 0.03515625 }, { "epoch": 0.36632390745501286, "step": 3705, "train/total_loss": 0.0937793105840683 }, { "entropy": 8.474640846252441, "epoch": 0.36642278030452835, "mean_token_accuracy": 0.7206029891967773, "num_tokens": 20369912.0, "step": 3706, "train/ce_loss": 0.5818215608596802 }, { "epoch": 0.36642278030452835, "step": 3706, "train/sim_loss": 0.0859375 }, { "epoch": 0.36642278030452835, "step": 3706, "train/total_loss": 0.14411965012550354 }, { "entropy": 8.810266494750977, "epoch": 0.3665216531540439, "mean_token_accuracy": 0.7451403737068176, "num_tokens": 20375448.0, "step": 3707, "train/ce_loss": 0.6795127391815186 }, { "epoch": 0.3665216531540439, "step": 3707, "train/sim_loss": 0.0234375 }, { "epoch": 0.3665216531540439, "step": 3707, "train/total_loss": 0.0913887768983841 }, { "entropy": 9.21399211883545, "epoch": 0.36662052600355943, "mean_token_accuracy": 0.744332492351532, "num_tokens": 20380815.0, "step": 3708, "train/ce_loss": 0.7412590384483337 }, { "epoch": 0.36662052600355943, "step": 3708, "train/sim_loss": 0.0390625 }, { "epoch": 0.36662052600355943, "step": 3708, "train/total_loss": 0.11318840831518173 }, { "entropy": 8.641327857971191, "epoch": 0.3667193988530749, "mean_token_accuracy": 0.744053304195404, "num_tokens": 20386499.0, "step": 3709, "train/ce_loss": 1.0999870300292969 }, { "epoch": 0.3667193988530749, "step": 3709, "train/sim_loss": 0.140625 }, { "epoch": 0.3667193988530749, "step": 3709, "train/total_loss": 0.2506237030029297 }, { "entropy": 9.093932151794434, "epoch": 0.36681827170259046, "mean_token_accuracy": 0.7414500713348389, "num_tokens": 20391776.0, "step": 3710, "train/ce_loss": 0.7837632298469543 }, { "epoch": 0.36681827170259046, "step": 3710, "train/sim_loss": 0.05078125 }, { "epoch": 0.36681827170259046, "step": 3710, "train/total_loss": 0.12915757298469543 }, { "entropy": 9.183428764343262, "epoch": 0.366917144552106, "mean_token_accuracy": 0.740359902381897, "num_tokens": 20397154.0, "step": 3711, "train/ce_loss": 0.6108941435813904 }, { "epoch": 0.366917144552106, "step": 3711, "train/sim_loss": 0.0546875 }, { "epoch": 0.366917144552106, "step": 3711, "train/total_loss": 0.1157769113779068 }, { "entropy": 9.295446395874023, "epoch": 0.36701601740162154, "mean_token_accuracy": 0.7176634073257446, "num_tokens": 20402462.0, "step": 3712, "train/ce_loss": 0.5010387897491455 }, { "epoch": 0.36701601740162154, "step": 3712, "train/sim_loss": 0.046875 }, { "epoch": 0.36701601740162154, "step": 3712, "train/total_loss": 0.09697888046503067 }, { "entropy": 8.647969245910645, "epoch": 0.367114890251137, "mean_token_accuracy": 0.7796609997749329, "num_tokens": 20408113.0, "step": 3713, "train/ce_loss": 0.7727713584899902 }, { "epoch": 0.367114890251137, "step": 3713, "train/sim_loss": 0.046875 }, { "epoch": 0.367114890251137, "step": 3713, "train/total_loss": 0.12415213882923126 }, { "entropy": 8.54621410369873, "epoch": 0.36721376310065257, "mean_token_accuracy": 0.7162944674491882, "num_tokens": 20413961.0, "step": 3714, "train/ce_loss": 0.6597913503646851 }, { "epoch": 0.36721376310065257, "step": 3714, "train/sim_loss": 0.03515625 }, { "epoch": 0.36721376310065257, "step": 3714, "train/total_loss": 0.10113538801670074 }, { "entropy": 8.563344955444336, "epoch": 0.3673126359501681, "mean_token_accuracy": 0.7178349494934082, "num_tokens": 20419597.0, "step": 3715, "train/ce_loss": 1.5820780992507935 }, { "epoch": 0.3673126359501681, "step": 3715, "train/sim_loss": 0.03515625 }, { "epoch": 0.3673126359501681, "step": 3715, "train/total_loss": 0.19336406886577606 }, { "entropy": 8.95662784576416, "epoch": 0.3674115087996836, "mean_token_accuracy": 0.7925407886505127, "num_tokens": 20425045.0, "step": 3716, "train/ce_loss": 0.5323399305343628 }, { "epoch": 0.3674115087996836, "step": 3716, "train/sim_loss": 0.0390625 }, { "epoch": 0.3674115087996836, "step": 3716, "train/total_loss": 0.09229649603366852 }, { "entropy": 8.976122856140137, "epoch": 0.36751038164919914, "mean_token_accuracy": 0.7416148781776428, "num_tokens": 20430446.0, "step": 3717, "train/ce_loss": 0.460513174533844 }, { "epoch": 0.36751038164919914, "step": 3717, "train/sim_loss": 0.078125 }, { "epoch": 0.36751038164919914, "step": 3717, "train/total_loss": 0.12417632341384888 }, { "entropy": 9.067848205566406, "epoch": 0.3676092544987147, "mean_token_accuracy": 0.7189384698867798, "num_tokens": 20435876.0, "step": 3718, "train/ce_loss": 0.944519579410553 }, { "epoch": 0.3676092544987147, "step": 3718, "train/sim_loss": 0.06640625 }, { "epoch": 0.3676092544987147, "step": 3718, "train/total_loss": 0.16085821390151978 }, { "entropy": 8.574031829833984, "epoch": 0.36770812734823016, "mean_token_accuracy": 0.7422497868537903, "num_tokens": 20441613.0, "step": 3719, "train/ce_loss": 2.0981812477111816 }, { "epoch": 0.36770812734823016, "step": 3719, "train/sim_loss": 0.0703125 }, { "epoch": 0.36770812734823016, "step": 3719, "train/total_loss": 0.28013062477111816 }, { "epoch": 0.3678070001977457, "grad_norm": 0.6511332392692566, "learning_rate": 9.082974830638382e-06, "loss": 0.1394, "step": 3720 }, { "entropy": 8.801862716674805, "epoch": 0.3678070001977457, "mean_token_accuracy": 0.7543859481811523, "num_tokens": 20447244.0, "step": 3720, "train/ce_loss": 0.5542744994163513 }, { "epoch": 0.3678070001977457, "step": 3720, "train/sim_loss": 0.046875 }, { "epoch": 0.3678070001977457, "step": 3720, "train/total_loss": 0.1023024469614029 }, { "entropy": 8.893808364868164, "epoch": 0.36790587304726124, "mean_token_accuracy": 0.7595375776290894, "num_tokens": 20452643.0, "step": 3721, "train/ce_loss": 1.1890771389007568 }, { "epoch": 0.36790587304726124, "step": 3721, "train/sim_loss": 0.05859375 }, { "epoch": 0.36790587304726124, "step": 3721, "train/total_loss": 0.17750146985054016 }, { "entropy": 8.643001556396484, "epoch": 0.36800474589677673, "mean_token_accuracy": 0.7250821590423584, "num_tokens": 20458190.0, "step": 3722, "train/ce_loss": 1.050977110862732 }, { "epoch": 0.36800474589677673, "step": 3722, "train/sim_loss": 0.03515625 }, { "epoch": 0.36800474589677673, "step": 3722, "train/total_loss": 0.1402539610862732 }, { "entropy": 9.074426651000977, "epoch": 0.36810361874629227, "mean_token_accuracy": 0.7443249821662903, "num_tokens": 20463617.0, "step": 3723, "train/ce_loss": 0.6282286047935486 }, { "epoch": 0.36810361874629227, "step": 3723, "train/sim_loss": 0.01953125 }, { "epoch": 0.36810361874629227, "step": 3723, "train/total_loss": 0.0823541134595871 }, { "entropy": 8.322723388671875, "epoch": 0.3682024915958078, "mean_token_accuracy": 0.7041229009628296, "num_tokens": 20469497.0, "step": 3724, "train/ce_loss": 2.071514368057251 }, { "epoch": 0.3682024915958078, "step": 3724, "train/sim_loss": 0.0625 }, { "epoch": 0.3682024915958078, "step": 3724, "train/total_loss": 0.2696514427661896 }, { "entropy": 9.053664207458496, "epoch": 0.3683013644453233, "mean_token_accuracy": 0.73235684633255, "num_tokens": 20474812.0, "step": 3725, "train/ce_loss": 0.9611563086509705 }, { "epoch": 0.3683013644453233, "step": 3725, "train/sim_loss": 0.0546875 }, { "epoch": 0.3683013644453233, "step": 3725, "train/total_loss": 0.15080313384532928 }, { "entropy": 8.433722496032715, "epoch": 0.36840023729483884, "mean_token_accuracy": 0.6919795274734497, "num_tokens": 20480553.0, "step": 3726, "train/ce_loss": 1.141463041305542 }, { "epoch": 0.36840023729483884, "step": 3726, "train/sim_loss": 0.05859375 }, { "epoch": 0.36840023729483884, "step": 3726, "train/total_loss": 0.17274005711078644 }, { "entropy": 8.68380069732666, "epoch": 0.3684991101443544, "mean_token_accuracy": 0.7761351466178894, "num_tokens": 20486188.0, "step": 3727, "train/ce_loss": 0.8290035724639893 }, { "epoch": 0.3684991101443544, "step": 3727, "train/sim_loss": 0.05859375 }, { "epoch": 0.3684991101443544, "step": 3727, "train/total_loss": 0.14149411022663116 }, { "entropy": 8.630732536315918, "epoch": 0.36859798299386987, "mean_token_accuracy": 0.7511467933654785, "num_tokens": 20491632.0, "step": 3728, "train/ce_loss": 0.712047278881073 }, { "epoch": 0.36859798299386987, "step": 3728, "train/sim_loss": 0.0625 }, { "epoch": 0.36859798299386987, "step": 3728, "train/total_loss": 0.13370472192764282 }, { "entropy": 8.662714958190918, "epoch": 0.3686968558433854, "mean_token_accuracy": 0.7513513565063477, "num_tokens": 20497073.0, "step": 3729, "train/ce_loss": 0.670468807220459 }, { "epoch": 0.3686968558433854, "step": 3729, "train/sim_loss": 0.0390625 }, { "epoch": 0.3686968558433854, "step": 3729, "train/total_loss": 0.1061093807220459 }, { "entropy": 9.1618013381958, "epoch": 0.36879572869290095, "mean_token_accuracy": 0.7405660152435303, "num_tokens": 20502511.0, "step": 3730, "train/ce_loss": 0.640701413154602 }, { "epoch": 0.36879572869290095, "step": 3730, "train/sim_loss": 0.03125 }, { "epoch": 0.36879572869290095, "step": 3730, "train/total_loss": 0.09532014280557632 }, { "entropy": 8.78432846069336, "epoch": 0.36889460154241643, "mean_token_accuracy": 0.8070374727249146, "num_tokens": 20507944.0, "step": 3731, "train/ce_loss": 0.8137669563293457 }, { "epoch": 0.36889460154241643, "step": 3731, "train/sim_loss": 0.05078125 }, { "epoch": 0.36889460154241643, "step": 3731, "train/total_loss": 0.13215795159339905 }, { "entropy": 9.447261810302734, "epoch": 0.368993474391932, "mean_token_accuracy": 0.7613104581832886, "num_tokens": 20513131.0, "step": 3732, "train/ce_loss": 0.9207032322883606 }, { "epoch": 0.368993474391932, "step": 3732, "train/sim_loss": 0.03125 }, { "epoch": 0.368993474391932, "step": 3732, "train/total_loss": 0.1233203262090683 }, { "entropy": 8.693253517150879, "epoch": 0.3690923472414475, "mean_token_accuracy": 0.7756654024124146, "num_tokens": 20518826.0, "step": 3733, "train/ce_loss": 1.0943294763565063 }, { "epoch": 0.3690923472414475, "step": 3733, "train/sim_loss": 0.0625 }, { "epoch": 0.3690923472414475, "step": 3733, "train/total_loss": 0.17193295061588287 }, { "entropy": 8.969476699829102, "epoch": 0.369191220090963, "mean_token_accuracy": 0.7488687634468079, "num_tokens": 20524357.0, "step": 3734, "train/ce_loss": 0.97007155418396 }, { "epoch": 0.369191220090963, "step": 3734, "train/sim_loss": 0.046875 }, { "epoch": 0.369191220090963, "step": 3734, "train/total_loss": 0.143882155418396 }, { "entropy": 8.736095428466797, "epoch": 0.36929009294047854, "mean_token_accuracy": 0.7595375776290894, "num_tokens": 20529849.0, "step": 3735, "train/ce_loss": 0.7521677613258362 }, { "epoch": 0.36929009294047854, "step": 3735, "train/sim_loss": 0.0859375 }, { "epoch": 0.36929009294047854, "step": 3735, "train/total_loss": 0.16115427017211914 }, { "entropy": 8.920188903808594, "epoch": 0.3693889657899941, "mean_token_accuracy": 0.7447368502616882, "num_tokens": 20535261.0, "step": 3736, "train/ce_loss": 0.5077464580535889 }, { "epoch": 0.3693889657899941, "step": 3736, "train/sim_loss": 0.0703125 }, { "epoch": 0.3693889657899941, "step": 3736, "train/total_loss": 0.12108714878559113 }, { "entropy": 8.85544490814209, "epoch": 0.36948783863950957, "mean_token_accuracy": 0.7117008566856384, "num_tokens": 20540697.0, "step": 3737, "train/ce_loss": 0.7742788195610046 }, { "epoch": 0.36948783863950957, "step": 3737, "train/sim_loss": 0.05859375 }, { "epoch": 0.36948783863950957, "step": 3737, "train/total_loss": 0.13602164387702942 }, { "entropy": 9.062042236328125, "epoch": 0.3695867114890251, "mean_token_accuracy": 0.7067183256149292, "num_tokens": 20546066.0, "step": 3738, "train/ce_loss": 0.6568614840507507 }, { "epoch": 0.3695867114890251, "step": 3738, "train/sim_loss": 0.0703125 }, { "epoch": 0.3695867114890251, "step": 3738, "train/total_loss": 0.1359986513853073 }, { "entropy": 8.591753959655762, "epoch": 0.36968558433854065, "mean_token_accuracy": 0.7738232016563416, "num_tokens": 20551627.0, "step": 3739, "train/ce_loss": 0.9568700790405273 }, { "epoch": 0.36968558433854065, "step": 3739, "train/sim_loss": 0.0546875 }, { "epoch": 0.36968558433854065, "step": 3739, "train/total_loss": 0.15037450194358826 }, { "epoch": 0.36978445718805614, "grad_norm": 0.6754093170166016, "learning_rate": 9.078029965880434e-06, "loss": 0.137, "step": 3740 }, { "entropy": 8.914134979248047, "epoch": 0.36978445718805614, "mean_token_accuracy": 0.7600979208946228, "num_tokens": 20557101.0, "step": 3740, "train/ce_loss": 0.6815298199653625 }, { "epoch": 0.36978445718805614, "step": 3740, "train/sim_loss": 0.06640625 }, { "epoch": 0.36978445718805614, "step": 3740, "train/total_loss": 0.1345592439174652 }, { "entropy": 8.848970413208008, "epoch": 0.3698833300375717, "mean_token_accuracy": 0.7970479726791382, "num_tokens": 20562580.0, "step": 3741, "train/ce_loss": 0.7132302522659302 }, { "epoch": 0.3698833300375717, "step": 3741, "train/sim_loss": 0.0234375 }, { "epoch": 0.3698833300375717, "step": 3741, "train/total_loss": 0.09476052969694138 }, { "entropy": 8.95608139038086, "epoch": 0.3699822028870872, "mean_token_accuracy": 0.7310810685157776, "num_tokens": 20567921.0, "step": 3742, "train/ce_loss": 1.137218952178955 }, { "epoch": 0.3699822028870872, "step": 3742, "train/sim_loss": 0.0625 }, { "epoch": 0.3699822028870872, "step": 3742, "train/total_loss": 0.17622190713882446 }, { "entropy": 8.735328674316406, "epoch": 0.3700810757366027, "mean_token_accuracy": 0.7581344842910767, "num_tokens": 20573456.0, "step": 3743, "train/ce_loss": 0.788256049156189 }, { "epoch": 0.3700810757366027, "step": 3743, "train/sim_loss": 0.07421875 }, { "epoch": 0.3700810757366027, "step": 3743, "train/total_loss": 0.15304435789585114 }, { "entropy": 9.137748718261719, "epoch": 0.37017994858611825, "mean_token_accuracy": 0.7381275296211243, "num_tokens": 20578860.0, "step": 3744, "train/ce_loss": 0.9805884957313538 }, { "epoch": 0.37017994858611825, "step": 3744, "train/sim_loss": 0.0703125 }, { "epoch": 0.37017994858611825, "step": 3744, "train/total_loss": 0.16837134957313538 }, { "entropy": 8.571564674377441, "epoch": 0.3702788214356338, "mean_token_accuracy": 0.7560483813285828, "num_tokens": 20584523.0, "step": 3745, "train/ce_loss": 0.5152580738067627 }, { "epoch": 0.3702788214356338, "step": 3745, "train/sim_loss": 0.0859375 }, { "epoch": 0.3702788214356338, "step": 3745, "train/total_loss": 0.1374633014202118 }, { "entropy": 8.490943908691406, "epoch": 0.3703776942851493, "mean_token_accuracy": 0.7176901698112488, "num_tokens": 20590211.0, "step": 3746, "train/ce_loss": 1.7508798837661743 }, { "epoch": 0.3703776942851493, "step": 3746, "train/sim_loss": 0.03125 }, { "epoch": 0.3703776942851493, "step": 3746, "train/total_loss": 0.20633798837661743 }, { "entropy": 8.727965354919434, "epoch": 0.3704765671346648, "mean_token_accuracy": 0.8098591566085815, "num_tokens": 20595739.0, "step": 3747, "train/ce_loss": 0.40634629130363464 }, { "epoch": 0.3704765671346648, "step": 3747, "train/sim_loss": 0.0625 }, { "epoch": 0.3704765671346648, "step": 3747, "train/total_loss": 0.1031346321105957 }, { "entropy": 8.211786270141602, "epoch": 0.37057543998418035, "mean_token_accuracy": 0.7457113862037659, "num_tokens": 20601455.0, "step": 3748, "train/ce_loss": 0.704289972782135 }, { "epoch": 0.37057543998418035, "step": 3748, "train/sim_loss": 0.09765625 }, { "epoch": 0.37057543998418035, "step": 3748, "train/total_loss": 0.1680852472782135 }, { "entropy": 8.882055282592773, "epoch": 0.37067431283369584, "mean_token_accuracy": 0.7737665176391602, "num_tokens": 20606885.0, "step": 3749, "train/ce_loss": 0.6541991233825684 }, { "epoch": 0.37067431283369584, "step": 3749, "train/sim_loss": 0.0546875 }, { "epoch": 0.37067431283369584, "step": 3749, "train/total_loss": 0.12010741233825684 }, { "entropy": 8.551878929138184, "epoch": 0.3707731856832114, "mean_token_accuracy": 0.7922077775001526, "num_tokens": 20612437.0, "step": 3750, "train/ce_loss": 0.8463929891586304 }, { "epoch": 0.3707731856832114, "step": 3750, "train/sim_loss": 0.0625 }, { "epoch": 0.3707731856832114, "step": 3750, "train/total_loss": 0.147139310836792 }, { "entropy": 8.661998748779297, "epoch": 0.3708720585327269, "mean_token_accuracy": 0.6893095970153809, "num_tokens": 20617951.0, "step": 3751, "train/ce_loss": 0.7420273423194885 }, { "epoch": 0.3708720585327269, "step": 3751, "train/sim_loss": 0.0390625 }, { "epoch": 0.3708720585327269, "step": 3751, "train/total_loss": 0.11326523870229721 }, { "entropy": 8.638775825500488, "epoch": 0.37097093138224246, "mean_token_accuracy": 0.7585799098014832, "num_tokens": 20623446.0, "step": 3752, "train/ce_loss": 0.830921471118927 }, { "epoch": 0.37097093138224246, "step": 3752, "train/sim_loss": 0.08984375 }, { "epoch": 0.37097093138224246, "step": 3752, "train/total_loss": 0.17293590307235718 }, { "entropy": 8.445999145507812, "epoch": 0.37106980423175795, "mean_token_accuracy": 0.7492931485176086, "num_tokens": 20629121.0, "step": 3753, "train/ce_loss": 0.49216845631599426 }, { "epoch": 0.37106980423175795, "step": 3753, "train/sim_loss": 0.05078125 }, { "epoch": 0.37106980423175795, "step": 3753, "train/total_loss": 0.0999981015920639 }, { "entropy": 8.617786407470703, "epoch": 0.3711686770812735, "mean_token_accuracy": 0.7518097162246704, "num_tokens": 20634756.0, "step": 3754, "train/ce_loss": 0.9528414011001587 }, { "epoch": 0.3711686770812735, "step": 3754, "train/sim_loss": 0.08984375 }, { "epoch": 0.3711686770812735, "step": 3754, "train/total_loss": 0.1851278841495514 }, { "entropy": 8.93307113647461, "epoch": 0.37126754993078903, "mean_token_accuracy": 0.7158671617507935, "num_tokens": 20640213.0, "step": 3755, "train/ce_loss": 0.9779796004295349 }, { "epoch": 0.37126754993078903, "step": 3755, "train/sim_loss": 0.05078125 }, { "epoch": 0.37126754993078903, "step": 3755, "train/total_loss": 0.1485792100429535 }, { "entropy": 8.496614456176758, "epoch": 0.3713664227803045, "mean_token_accuracy": 0.7623853087425232, "num_tokens": 20645846.0, "step": 3756, "train/ce_loss": 0.6516854763031006 }, { "epoch": 0.3713664227803045, "step": 3756, "train/sim_loss": 0.0625 }, { "epoch": 0.3713664227803045, "step": 3756, "train/total_loss": 0.127668559551239 }, { "entropy": 8.934723854064941, "epoch": 0.37146529562982006, "mean_token_accuracy": 0.7490397095680237, "num_tokens": 20651249.0, "step": 3757, "train/ce_loss": 0.6794103980064392 }, { "epoch": 0.37146529562982006, "step": 3757, "train/sim_loss": 0.05859375 }, { "epoch": 0.37146529562982006, "step": 3757, "train/total_loss": 0.12653478980064392 }, { "entropy": 9.122142791748047, "epoch": 0.3715641684793356, "mean_token_accuracy": 0.7472178339958191, "num_tokens": 20656479.0, "step": 3758, "train/ce_loss": 0.8393281102180481 }, { "epoch": 0.3715641684793356, "step": 3758, "train/sim_loss": 0.08203125 }, { "epoch": 0.3715641684793356, "step": 3758, "train/total_loss": 0.1659640669822693 }, { "entropy": 8.826394081115723, "epoch": 0.3716630413288511, "mean_token_accuracy": 0.729608952999115, "num_tokens": 20661979.0, "step": 3759, "train/ce_loss": 0.5229291915893555 }, { "epoch": 0.3716630413288511, "step": 3759, "train/sim_loss": 0.0390625 }, { "epoch": 0.3716630413288511, "step": 3759, "train/total_loss": 0.09135542064905167 }, { "epoch": 0.3717619141783666, "grad_norm": 0.8478018641471863, "learning_rate": 9.073085101122485e-06, "loss": 0.1399, "step": 3760 }, { "entropy": 8.868673324584961, "epoch": 0.3717619141783666, "mean_token_accuracy": 0.7564102411270142, "num_tokens": 20667533.0, "step": 3760, "train/ce_loss": 1.0167475938796997 }, { "epoch": 0.3717619141783666, "step": 3760, "train/sim_loss": 0.07421875 }, { "epoch": 0.3717619141783666, "step": 3760, "train/total_loss": 0.17589351534843445 }, { "entropy": 8.478425979614258, "epoch": 0.37186078702788217, "mean_token_accuracy": 0.7453142404556274, "num_tokens": 20673113.0, "step": 3761, "train/ce_loss": 0.6577131152153015 }, { "epoch": 0.37186078702788217, "step": 3761, "train/sim_loss": 0.015625 }, { "epoch": 0.37186078702788217, "step": 3761, "train/total_loss": 0.08139631152153015 }, { "entropy": 8.780416488647461, "epoch": 0.37195965987739765, "mean_token_accuracy": 0.7472885251045227, "num_tokens": 20678594.0, "step": 3762, "train/ce_loss": 0.5779060125350952 }, { "epoch": 0.37195965987739765, "step": 3762, "train/sim_loss": 0.015625 }, { "epoch": 0.37195965987739765, "step": 3762, "train/total_loss": 0.073415607213974 }, { "entropy": 8.798819541931152, "epoch": 0.3720585327269132, "mean_token_accuracy": 0.7591069340705872, "num_tokens": 20683977.0, "step": 3763, "train/ce_loss": 0.9131091237068176 }, { "epoch": 0.3720585327269132, "step": 3763, "train/sim_loss": 0.07421875 }, { "epoch": 0.3720585327269132, "step": 3763, "train/total_loss": 0.16552966833114624 }, { "entropy": 8.499977111816406, "epoch": 0.37215740557642873, "mean_token_accuracy": 0.7220683097839355, "num_tokens": 20689780.0, "step": 3764, "train/ce_loss": 1.2852333784103394 }, { "epoch": 0.37215740557642873, "step": 3764, "train/sim_loss": 0.06640625 }, { "epoch": 0.37215740557642873, "step": 3764, "train/total_loss": 0.1949295848608017 }, { "entropy": 8.840158462524414, "epoch": 0.3722562784259442, "mean_token_accuracy": 0.7456647157669067, "num_tokens": 20695279.0, "step": 3765, "train/ce_loss": 0.6694163680076599 }, { "epoch": 0.3722562784259442, "step": 3765, "train/sim_loss": 0.09375 }, { "epoch": 0.3722562784259442, "step": 3765, "train/total_loss": 0.16069164872169495 }, { "entropy": 9.110905647277832, "epoch": 0.37235515127545976, "mean_token_accuracy": 0.7684659361839294, "num_tokens": 20700500.0, "step": 3766, "train/ce_loss": 0.6621342301368713 }, { "epoch": 0.37235515127545976, "step": 3766, "train/sim_loss": 0.03125 }, { "epoch": 0.37235515127545976, "step": 3766, "train/total_loss": 0.09746342152357101 }, { "entropy": 8.23089599609375, "epoch": 0.3724540241249753, "mean_token_accuracy": 0.7095435857772827, "num_tokens": 20706296.0, "step": 3767, "train/ce_loss": 1.9088373184204102 }, { "epoch": 0.3724540241249753, "step": 3767, "train/sim_loss": 0.0859375 }, { "epoch": 0.3724540241249753, "step": 3767, "train/total_loss": 0.2768212556838989 }, { "entropy": 8.817030906677246, "epoch": 0.3725528969744908, "mean_token_accuracy": 0.7530319690704346, "num_tokens": 20711826.0, "step": 3768, "train/ce_loss": 0.8140357732772827 }, { "epoch": 0.3725528969744908, "step": 3768, "train/sim_loss": 0.1171875 }, { "epoch": 0.3725528969744908, "step": 3768, "train/total_loss": 0.19859108328819275 }, { "entropy": 8.590417861938477, "epoch": 0.37265176982400633, "mean_token_accuracy": 0.7411527037620544, "num_tokens": 20717480.0, "step": 3769, "train/ce_loss": 0.7405049204826355 }, { "epoch": 0.37265176982400633, "step": 3769, "train/sim_loss": 0.05078125 }, { "epoch": 0.37265176982400633, "step": 3769, "train/total_loss": 0.12483174353837967 }, { "entropy": 8.382266998291016, "epoch": 0.37275064267352187, "mean_token_accuracy": 0.7269076108932495, "num_tokens": 20723106.0, "step": 3770, "train/ce_loss": 0.43995803594589233 }, { "epoch": 0.37275064267352187, "step": 3770, "train/sim_loss": 0.02734375 }, { "epoch": 0.37275064267352187, "step": 3770, "train/total_loss": 0.07133955508470535 }, { "entropy": 8.659772872924805, "epoch": 0.37284951552303736, "mean_token_accuracy": 0.7511363625526428, "num_tokens": 20728629.0, "step": 3771, "train/ce_loss": 0.9865226149559021 }, { "epoch": 0.37284951552303736, "step": 3771, "train/sim_loss": 0.046875 }, { "epoch": 0.37284951552303736, "step": 3771, "train/total_loss": 0.14552727341651917 }, { "entropy": 8.64848518371582, "epoch": 0.3729483883725529, "mean_token_accuracy": 0.7582417726516724, "num_tokens": 20734259.0, "step": 3772, "train/ce_loss": 0.5003005266189575 }, { "epoch": 0.3729483883725529, "step": 3772, "train/sim_loss": 0.0859375 }, { "epoch": 0.3729483883725529, "step": 3772, "train/total_loss": 0.13596755266189575 }, { "entropy": 8.94666576385498, "epoch": 0.37304726122206844, "mean_token_accuracy": 0.754878044128418, "num_tokens": 20739570.0, "step": 3773, "train/ce_loss": 0.6738657355308533 }, { "epoch": 0.37304726122206844, "step": 3773, "train/sim_loss": 0.05078125 }, { "epoch": 0.37304726122206844, "step": 3773, "train/total_loss": 0.11816782504320145 }, { "entropy": 9.407923698425293, "epoch": 0.3731461340715839, "mean_token_accuracy": 0.7785016298294067, "num_tokens": 20744708.0, "step": 3774, "train/ce_loss": 0.670421302318573 }, { "epoch": 0.3731461340715839, "step": 3774, "train/sim_loss": 0.01953125 }, { "epoch": 0.3731461340715839, "step": 3774, "train/total_loss": 0.08657338470220566 }, { "entropy": 8.786133766174316, "epoch": 0.37324500692109946, "mean_token_accuracy": 0.7320340871810913, "num_tokens": 20750164.0, "step": 3775, "train/ce_loss": 0.6107103228569031 }, { "epoch": 0.37324500692109946, "step": 3775, "train/sim_loss": 0.0859375 }, { "epoch": 0.37324500692109946, "step": 3775, "train/total_loss": 0.14700853824615479 }, { "entropy": 8.920676231384277, "epoch": 0.373343879770615, "mean_token_accuracy": 0.7383512258529663, "num_tokens": 20755548.0, "step": 3776, "train/ce_loss": 1.057492733001709 }, { "epoch": 0.373343879770615, "step": 3776, "train/sim_loss": 0.0546875 }, { "epoch": 0.373343879770615, "step": 3776, "train/total_loss": 0.16043677926063538 }, { "entropy": 8.712064743041992, "epoch": 0.3734427526201305, "mean_token_accuracy": 0.710497260093689, "num_tokens": 20761105.0, "step": 3777, "train/ce_loss": 1.205745816230774 }, { "epoch": 0.3734427526201305, "step": 3777, "train/sim_loss": 0.05078125 }, { "epoch": 0.3734427526201305, "step": 3777, "train/total_loss": 0.17135584354400635 }, { "entropy": 8.715276718139648, "epoch": 0.37354162546964603, "mean_token_accuracy": 0.792569637298584, "num_tokens": 20766711.0, "step": 3778, "train/ce_loss": 0.756398618221283 }, { "epoch": 0.37354162546964603, "step": 3778, "train/sim_loss": 0.08203125 }, { "epoch": 0.37354162546964603, "step": 3778, "train/total_loss": 0.15767112374305725 }, { "entropy": 8.760663986206055, "epoch": 0.3736404983191616, "mean_token_accuracy": 0.7437295317649841, "num_tokens": 20772254.0, "step": 3779, "train/ce_loss": 0.6027212738990784 }, { "epoch": 0.3736404983191616, "step": 3779, "train/sim_loss": 0.0234375 }, { "epoch": 0.3736404983191616, "step": 3779, "train/total_loss": 0.08370962738990784 }, { "epoch": 0.37373937116867706, "grad_norm": 0.7324458360671997, "learning_rate": 9.068140236364537e-06, "loss": 0.1421, "step": 3780 }, { "entropy": 8.81997299194336, "epoch": 0.37373937116867706, "mean_token_accuracy": 0.7590497732162476, "num_tokens": 20777670.0, "step": 3780, "train/ce_loss": 0.5722731947898865 }, { "epoch": 0.37373937116867706, "step": 3780, "train/sim_loss": 0.05859375 }, { "epoch": 0.37373937116867706, "step": 3780, "train/total_loss": 0.11582107096910477 }, { "entropy": 8.626713752746582, "epoch": 0.3738382440181926, "mean_token_accuracy": 0.7219770550727844, "num_tokens": 20783294.0, "step": 3781, "train/ce_loss": 0.5350469946861267 }, { "epoch": 0.3738382440181926, "step": 3781, "train/sim_loss": 0.0546875 }, { "epoch": 0.3738382440181926, "step": 3781, "train/total_loss": 0.10819220542907715 }, { "entropy": 8.682674407958984, "epoch": 0.37393711686770814, "mean_token_accuracy": 0.7635270357131958, "num_tokens": 20788963.0, "step": 3782, "train/ce_loss": 0.5108616352081299 }, { "epoch": 0.37393711686770814, "step": 3782, "train/sim_loss": 0.0234375 }, { "epoch": 0.37393711686770814, "step": 3782, "train/total_loss": 0.07452366501092911 }, { "entropy": 8.885889053344727, "epoch": 0.3740359897172236, "mean_token_accuracy": 0.7604166865348816, "num_tokens": 20794578.0, "step": 3783, "train/ce_loss": 0.9713773727416992 }, { "epoch": 0.3740359897172236, "step": 3783, "train/sim_loss": 0.0703125 }, { "epoch": 0.3740359897172236, "step": 3783, "train/total_loss": 0.16745024919509888 }, { "entropy": 8.728971481323242, "epoch": 0.37413486256673917, "mean_token_accuracy": 0.7426614761352539, "num_tokens": 20800241.0, "step": 3784, "train/ce_loss": 0.9498751163482666 }, { "epoch": 0.37413486256673917, "step": 3784, "train/sim_loss": 0.0859375 }, { "epoch": 0.37413486256673917, "step": 3784, "train/total_loss": 0.18092501163482666 }, { "entropy": 9.133694648742676, "epoch": 0.3742337354162547, "mean_token_accuracy": 0.7041666507720947, "num_tokens": 20805582.0, "step": 3785, "train/ce_loss": 0.7171686887741089 }, { "epoch": 0.3742337354162547, "step": 3785, "train/sim_loss": 0.07421875 }, { "epoch": 0.3742337354162547, "step": 3785, "train/total_loss": 0.14593562483787537 }, { "entropy": 9.078428268432617, "epoch": 0.3743326082657702, "mean_token_accuracy": 0.7873183488845825, "num_tokens": 20810902.0, "step": 3786, "train/ce_loss": 1.0280879735946655 }, { "epoch": 0.3743326082657702, "step": 3786, "train/sim_loss": 0.06640625 }, { "epoch": 0.3743326082657702, "step": 3786, "train/total_loss": 0.16921505331993103 }, { "entropy": 8.747568130493164, "epoch": 0.37443148111528574, "mean_token_accuracy": 0.6855345964431763, "num_tokens": 20816563.0, "step": 3787, "train/ce_loss": 0.8172718286514282 }, { "epoch": 0.37443148111528574, "step": 3787, "train/sim_loss": 0.0625 }, { "epoch": 0.37443148111528574, "step": 3787, "train/total_loss": 0.14422717690467834 }, { "entropy": 9.103796005249023, "epoch": 0.3745303539648013, "mean_token_accuracy": 0.7722646594047546, "num_tokens": 20821994.0, "step": 3788, "train/ce_loss": 0.4136883318424225 }, { "epoch": 0.3745303539648013, "step": 3788, "train/sim_loss": 0.05078125 }, { "epoch": 0.3745303539648013, "step": 3788, "train/total_loss": 0.09215008467435837 }, { "entropy": 8.80884075164795, "epoch": 0.37462922681431676, "mean_token_accuracy": 0.8215962648391724, "num_tokens": 20827495.0, "step": 3789, "train/ce_loss": 0.6907909512519836 }, { "epoch": 0.37462922681431676, "step": 3789, "train/sim_loss": 0.06640625 }, { "epoch": 0.37462922681431676, "step": 3789, "train/total_loss": 0.13548535108566284 }, { "entropy": 8.889633178710938, "epoch": 0.3747280996638323, "mean_token_accuracy": 0.7306867241859436, "num_tokens": 20833022.0, "step": 3790, "train/ce_loss": 0.6695831418037415 }, { "epoch": 0.3747280996638323, "step": 3790, "train/sim_loss": 0.0390625 }, { "epoch": 0.3747280996638323, "step": 3790, "train/total_loss": 0.10602081567049026 }, { "entropy": 8.867337226867676, "epoch": 0.37482697251334784, "mean_token_accuracy": 0.7799999713897705, "num_tokens": 20838381.0, "step": 3791, "train/ce_loss": 0.5080693960189819 }, { "epoch": 0.37482697251334784, "step": 3791, "train/sim_loss": 0.0625 }, { "epoch": 0.37482697251334784, "step": 3791, "train/total_loss": 0.1133069396018982 }, { "entropy": 8.966775894165039, "epoch": 0.37492584536286333, "mean_token_accuracy": 0.7225064039230347, "num_tokens": 20843817.0, "step": 3792, "train/ce_loss": 0.6632620096206665 }, { "epoch": 0.37492584536286333, "step": 3792, "train/sim_loss": 0.05078125 }, { "epoch": 0.37492584536286333, "step": 3792, "train/total_loss": 0.11710745096206665 }, { "entropy": 9.054179191589355, "epoch": 0.37502471821237887, "mean_token_accuracy": 0.7767123579978943, "num_tokens": 20849089.0, "step": 3793, "train/ce_loss": 0.769820511341095 }, { "epoch": 0.37502471821237887, "step": 3793, "train/sim_loss": 0.0390625 }, { "epoch": 0.37502471821237887, "step": 3793, "train/total_loss": 0.1160445511341095 }, { "entropy": 8.8021879196167, "epoch": 0.3751235910618944, "mean_token_accuracy": 0.7179803252220154, "num_tokens": 20854696.0, "step": 3794, "train/ce_loss": 0.7783365249633789 }, { "epoch": 0.3751235910618944, "step": 3794, "train/sim_loss": 0.109375 }, { "epoch": 0.3751235910618944, "step": 3794, "train/total_loss": 0.1872086524963379 }, { "entropy": 8.834619522094727, "epoch": 0.37522246391140995, "mean_token_accuracy": 0.715634822845459, "num_tokens": 20860317.0, "step": 3795, "train/ce_loss": 0.540078341960907 }, { "epoch": 0.37522246391140995, "step": 3795, "train/sim_loss": 0.0390625 }, { "epoch": 0.37522246391140995, "step": 3795, "train/total_loss": 0.09307033568620682 }, { "entropy": 8.679250717163086, "epoch": 0.37532133676092544, "mean_token_accuracy": 0.7760663628578186, "num_tokens": 20865754.0, "step": 3796, "train/ce_loss": 0.6622959971427917 }, { "epoch": 0.37532133676092544, "step": 3796, "train/sim_loss": 0.03515625 }, { "epoch": 0.37532133676092544, "step": 3796, "train/total_loss": 0.10138585418462753 }, { "entropy": 8.911635398864746, "epoch": 0.375420209610441, "mean_token_accuracy": 0.8127853870391846, "num_tokens": 20871270.0, "step": 3797, "train/ce_loss": 0.44012805819511414 }, { "epoch": 0.375420209610441, "step": 3797, "train/sim_loss": 0.05859375 }, { "epoch": 0.375420209610441, "step": 3797, "train/total_loss": 0.10260655730962753 }, { "entropy": 9.103458404541016, "epoch": 0.3755190824599565, "mean_token_accuracy": 0.7335058450698853, "num_tokens": 20876645.0, "step": 3798, "train/ce_loss": 0.8721626996994019 }, { "epoch": 0.3755190824599565, "step": 3798, "train/sim_loss": 0.05078125 }, { "epoch": 0.3755190824599565, "step": 3798, "train/total_loss": 0.13799752295017242 }, { "entropy": 8.536456108093262, "epoch": 0.375617955309472, "mean_token_accuracy": 0.7493138313293457, "num_tokens": 20882324.0, "step": 3799, "train/ce_loss": 1.6114263534545898 }, { "epoch": 0.375617955309472, "step": 3799, "train/sim_loss": 0.0546875 }, { "epoch": 0.375617955309472, "step": 3799, "train/total_loss": 0.21583013236522675 }, { "epoch": 0.37571682815898755, "grad_norm": 0.6548178791999817, "learning_rate": 9.063195371606588e-06, "loss": 0.1368, "step": 3800 }, { "entropy": 8.973831176757812, "epoch": 0.37571682815898755, "mean_token_accuracy": 0.7886792421340942, "num_tokens": 20887769.0, "step": 3800, "train/ce_loss": 0.6508644819259644 }, { "epoch": 0.37571682815898755, "step": 3800, "train/sim_loss": 0.0390625 }, { "epoch": 0.37571682815898755, "step": 3800, "train/total_loss": 0.10414894670248032 }, { "entropy": 8.830668449401855, "epoch": 0.3758157010085031, "mean_token_accuracy": 0.7386116981506348, "num_tokens": 20893324.0, "step": 3801, "train/ce_loss": 0.9182262420654297 }, { "epoch": 0.3758157010085031, "step": 3801, "train/sim_loss": 0.02734375 }, { "epoch": 0.3758157010085031, "step": 3801, "train/total_loss": 0.11916637420654297 }, { "entropy": 8.841711044311523, "epoch": 0.3759145738580186, "mean_token_accuracy": 0.746835470199585, "num_tokens": 20898863.0, "step": 3802, "train/ce_loss": 0.8652566075325012 }, { "epoch": 0.3759145738580186, "step": 3802, "train/sim_loss": 0.1015625 }, { "epoch": 0.3759145738580186, "step": 3802, "train/total_loss": 0.18808816373348236 }, { "entropy": 8.797103881835938, "epoch": 0.3760134467075341, "mean_token_accuracy": 0.7824019193649292, "num_tokens": 20904351.0, "step": 3803, "train/ce_loss": 1.3885899782180786 }, { "epoch": 0.3760134467075341, "step": 3803, "train/sim_loss": 0.109375 }, { "epoch": 0.3760134467075341, "step": 3803, "train/total_loss": 0.24823400378227234 }, { "entropy": 9.082989692687988, "epoch": 0.37611231955704966, "mean_token_accuracy": 0.7745097875595093, "num_tokens": 20909664.0, "step": 3804, "train/ce_loss": 0.8082454204559326 }, { "epoch": 0.37611231955704966, "step": 3804, "train/sim_loss": 0.0625 }, { "epoch": 0.37611231955704966, "step": 3804, "train/total_loss": 0.14332455396652222 }, { "entropy": 8.682798385620117, "epoch": 0.37621119240656514, "mean_token_accuracy": 0.7897142767906189, "num_tokens": 20915111.0, "step": 3805, "train/ce_loss": 0.703971266746521 }, { "epoch": 0.37621119240656514, "step": 3805, "train/sim_loss": 0.05078125 }, { "epoch": 0.37621119240656514, "step": 3805, "train/total_loss": 0.12117838114500046 }, { "entropy": 8.661231994628906, "epoch": 0.3763100652560807, "mean_token_accuracy": 0.7603639960289001, "num_tokens": 20920662.0, "step": 3806, "train/ce_loss": 1.5175753831863403 }, { "epoch": 0.3763100652560807, "step": 3806, "train/sim_loss": 0.0625 }, { "epoch": 0.3763100652560807, "step": 3806, "train/total_loss": 0.21425753831863403 }, { "entropy": 8.539913177490234, "epoch": 0.3764089381055962, "mean_token_accuracy": 0.74410480260849, "num_tokens": 20926413.0, "step": 3807, "train/ce_loss": 0.33939796686172485 }, { "epoch": 0.3764089381055962, "step": 3807, "train/sim_loss": 0.05078125 }, { "epoch": 0.3764089381055962, "step": 3807, "train/total_loss": 0.08472104370594025 }, { "entropy": 8.898093223571777, "epoch": 0.3765078109551117, "mean_token_accuracy": 0.8004201650619507, "num_tokens": 20931957.0, "step": 3808, "train/ce_loss": 0.6008527278900146 }, { "epoch": 0.3765078109551117, "step": 3808, "train/sim_loss": 0.01953125 }, { "epoch": 0.3765078109551117, "step": 3808, "train/total_loss": 0.07961652427911758 }, { "entropy": 8.624210357666016, "epoch": 0.37660668380462725, "mean_token_accuracy": 0.7225501537322998, "num_tokens": 20937381.0, "step": 3809, "train/ce_loss": 1.2048296928405762 }, { "epoch": 0.37660668380462725, "step": 3809, "train/sim_loss": 0.10546875 }, { "epoch": 0.37660668380462725, "step": 3809, "train/total_loss": 0.22595173120498657 }, { "entropy": 8.948141098022461, "epoch": 0.3767055566541428, "mean_token_accuracy": 0.7298578023910522, "num_tokens": 20942858.0, "step": 3810, "train/ce_loss": 0.7502251863479614 }, { "epoch": 0.3767055566541428, "step": 3810, "train/sim_loss": 0.04296875 }, { "epoch": 0.3767055566541428, "step": 3810, "train/total_loss": 0.11799126863479614 }, { "entropy": 9.133652687072754, "epoch": 0.3768044295036583, "mean_token_accuracy": 0.7116104960441589, "num_tokens": 20948421.0, "step": 3811, "train/ce_loss": 0.6623936891555786 }, { "epoch": 0.3768044295036583, "step": 3811, "train/sim_loss": 0.046875 }, { "epoch": 0.3768044295036583, "step": 3811, "train/total_loss": 0.1131143718957901 }, { "entropy": 8.922361373901367, "epoch": 0.3769033023531738, "mean_token_accuracy": 0.7602648735046387, "num_tokens": 20953994.0, "step": 3812, "train/ce_loss": 0.8224928975105286 }, { "epoch": 0.3769033023531738, "step": 3812, "train/sim_loss": 0.109375 }, { "epoch": 0.3769033023531738, "step": 3812, "train/total_loss": 0.19162428379058838 }, { "entropy": 8.906381607055664, "epoch": 0.37700217520268936, "mean_token_accuracy": 0.7626146674156189, "num_tokens": 20959433.0, "step": 3813, "train/ce_loss": 0.9079974889755249 }, { "epoch": 0.37700217520268936, "step": 3813, "train/sim_loss": 0.05078125 }, { "epoch": 0.37700217520268936, "step": 3813, "train/total_loss": 0.1415809988975525 }, { "entropy": 8.794591903686523, "epoch": 0.37710104805220485, "mean_token_accuracy": 0.7636986374855042, "num_tokens": 20964947.0, "step": 3814, "train/ce_loss": 0.6311110258102417 }, { "epoch": 0.37710104805220485, "step": 3814, "train/sim_loss": 0.05859375 }, { "epoch": 0.37710104805220485, "step": 3814, "train/total_loss": 0.12170485407114029 }, { "entropy": 8.593782424926758, "epoch": 0.3771999209017204, "mean_token_accuracy": 0.7798941731452942, "num_tokens": 20970544.0, "step": 3815, "train/ce_loss": 0.4259324371814728 }, { "epoch": 0.3771999209017204, "step": 3815, "train/sim_loss": 0.0703125 }, { "epoch": 0.3771999209017204, "step": 3815, "train/total_loss": 0.11290574073791504 }, { "entropy": 9.052928924560547, "epoch": 0.37729879375123593, "mean_token_accuracy": 0.765625, "num_tokens": 20975971.0, "step": 3816, "train/ce_loss": 0.6771499514579773 }, { "epoch": 0.37729879375123593, "step": 3816, "train/sim_loss": 0.046875 }, { "epoch": 0.37729879375123593, "step": 3816, "train/total_loss": 0.11458999663591385 }, { "entropy": 8.80251407623291, "epoch": 0.3773976666007514, "mean_token_accuracy": 0.7327690720558167, "num_tokens": 20981456.0, "step": 3817, "train/ce_loss": 0.5963584780693054 }, { "epoch": 0.3773976666007514, "step": 3817, "train/sim_loss": 0.07421875 }, { "epoch": 0.3773976666007514, "step": 3817, "train/total_loss": 0.13385459780693054 }, { "entropy": 8.62576675415039, "epoch": 0.37749653945026695, "mean_token_accuracy": 0.7212066054344177, "num_tokens": 20987165.0, "step": 3818, "train/ce_loss": 0.4430178701877594 }, { "epoch": 0.37749653945026695, "step": 3818, "train/sim_loss": 0.0234375 }, { "epoch": 0.37749653945026695, "step": 3818, "train/total_loss": 0.06773929297924042 }, { "entropy": 8.725448608398438, "epoch": 0.3775954122997825, "mean_token_accuracy": 0.7220077514648438, "num_tokens": 20992598.0, "step": 3819, "train/ce_loss": 0.9853853583335876 }, { "epoch": 0.3775954122997825, "step": 3819, "train/sim_loss": 0.046875 }, { "epoch": 0.3775954122997825, "step": 3819, "train/total_loss": 0.14541354775428772 }, { "epoch": 0.377694285149298, "grad_norm": 0.8399968147277832, "learning_rate": 9.058250506848638e-06, "loss": 0.1392, "step": 3820 }, { "entropy": 8.767312049865723, "epoch": 0.377694285149298, "mean_token_accuracy": 0.7579281330108643, "num_tokens": 20998160.0, "step": 3820, "train/ce_loss": 0.973065197467804 }, { "epoch": 0.377694285149298, "step": 3820, "train/sim_loss": 0.09765625 }, { "epoch": 0.377694285149298, "step": 3820, "train/total_loss": 0.1949627697467804 }, { "entropy": 9.210311889648438, "epoch": 0.3777931579988135, "mean_token_accuracy": 0.7618364691734314, "num_tokens": 21003474.0, "step": 3821, "train/ce_loss": 0.6366482377052307 }, { "epoch": 0.3777931579988135, "step": 3821, "train/sim_loss": 0.0625 }, { "epoch": 0.3777931579988135, "step": 3821, "train/total_loss": 0.12616482377052307 }, { "entropy": 8.844480514526367, "epoch": 0.37789203084832906, "mean_token_accuracy": 0.8004837036132812, "num_tokens": 21008985.0, "step": 3822, "train/ce_loss": 0.3885408341884613 }, { "epoch": 0.37789203084832906, "step": 3822, "train/sim_loss": 0.0390625 }, { "epoch": 0.37789203084832906, "step": 3822, "train/total_loss": 0.07791658490896225 }, { "entropy": 8.85986328125, "epoch": 0.37799090369784455, "mean_token_accuracy": 0.7387173175811768, "num_tokens": 21014392.0, "step": 3823, "train/ce_loss": 0.5886112451553345 }, { "epoch": 0.37799090369784455, "step": 3823, "train/sim_loss": 0.03125 }, { "epoch": 0.37799090369784455, "step": 3823, "train/total_loss": 0.09011112153530121 }, { "entropy": 9.191099166870117, "epoch": 0.3780897765473601, "mean_token_accuracy": 0.7140992283821106, "num_tokens": 21019805.0, "step": 3824, "train/ce_loss": 0.5851342678070068 }, { "epoch": 0.3780897765473601, "step": 3824, "train/sim_loss": 0.0546875 }, { "epoch": 0.3780897765473601, "step": 3824, "train/total_loss": 0.11320093274116516 }, { "entropy": 9.053877830505371, "epoch": 0.37818864939687563, "mean_token_accuracy": 0.7115839123725891, "num_tokens": 21025167.0, "step": 3825, "train/ce_loss": 0.4774901568889618 }, { "epoch": 0.37818864939687563, "step": 3825, "train/sim_loss": 0.03125 }, { "epoch": 0.37818864939687563, "step": 3825, "train/total_loss": 0.07899901270866394 }, { "entropy": 9.230544090270996, "epoch": 0.3782875222463911, "mean_token_accuracy": 0.7387518286705017, "num_tokens": 21030480.0, "step": 3826, "train/ce_loss": 0.7353373169898987 }, { "epoch": 0.3782875222463911, "step": 3826, "train/sim_loss": 0.046875 }, { "epoch": 0.3782875222463911, "step": 3826, "train/total_loss": 0.12040873616933823 }, { "entropy": 8.920063972473145, "epoch": 0.37838639509590666, "mean_token_accuracy": 0.7428924441337585, "num_tokens": 21035890.0, "step": 3827, "train/ce_loss": 0.725837767124176 }, { "epoch": 0.37838639509590666, "step": 3827, "train/sim_loss": 0.04296875 }, { "epoch": 0.37838639509590666, "step": 3827, "train/total_loss": 0.11555252969264984 }, { "entropy": 9.184647560119629, "epoch": 0.3784852679454222, "mean_token_accuracy": 0.7595628499984741, "num_tokens": 21041184.0, "step": 3828, "train/ce_loss": 0.8798593282699585 }, { "epoch": 0.3784852679454222, "step": 3828, "train/sim_loss": 0.0625 }, { "epoch": 0.3784852679454222, "step": 3828, "train/total_loss": 0.15048593282699585 }, { "entropy": 8.906173706054688, "epoch": 0.3785841407949377, "mean_token_accuracy": 0.7453608512878418, "num_tokens": 21046797.0, "step": 3829, "train/ce_loss": 0.323254257440567 }, { "epoch": 0.3785841407949377, "step": 3829, "train/sim_loss": 0.16796875 }, { "epoch": 0.3785841407949377, "step": 3829, "train/total_loss": 0.20029418170452118 }, { "entropy": 9.072322845458984, "epoch": 0.3786830136444532, "mean_token_accuracy": 0.7272727489471436, "num_tokens": 21052317.0, "step": 3830, "train/ce_loss": 0.5861014723777771 }, { "epoch": 0.3786830136444532, "step": 3830, "train/sim_loss": 0.0546875 }, { "epoch": 0.3786830136444532, "step": 3830, "train/total_loss": 0.11329764872789383 }, { "entropy": 8.632804870605469, "epoch": 0.37878188649396877, "mean_token_accuracy": 0.7409224510192871, "num_tokens": 21057952.0, "step": 3831, "train/ce_loss": 0.7452666759490967 }, { "epoch": 0.37878188649396877, "step": 3831, "train/sim_loss": 0.046875 }, { "epoch": 0.37878188649396877, "step": 3831, "train/total_loss": 0.12140166759490967 }, { "entropy": 9.02867317199707, "epoch": 0.37888075934348425, "mean_token_accuracy": 0.803636372089386, "num_tokens": 21063338.0, "step": 3832, "train/ce_loss": 0.3814040720462799 }, { "epoch": 0.37888075934348425, "step": 3832, "train/sim_loss": 0.02734375 }, { "epoch": 0.37888075934348425, "step": 3832, "train/total_loss": 0.06548415869474411 }, { "entropy": 9.165508270263672, "epoch": 0.3789796321929998, "mean_token_accuracy": 0.7440000176429749, "num_tokens": 21068755.0, "step": 3833, "train/ce_loss": 1.0824581384658813 }, { "epoch": 0.3789796321929998, "step": 3833, "train/sim_loss": 0.0703125 }, { "epoch": 0.3789796321929998, "step": 3833, "train/total_loss": 0.1785583198070526 }, { "entropy": 8.860503196716309, "epoch": 0.37907850504251533, "mean_token_accuracy": 0.6985769867897034, "num_tokens": 21074177.0, "step": 3834, "train/ce_loss": 1.1932843923568726 }, { "epoch": 0.37907850504251533, "step": 3834, "train/sim_loss": 0.08984375 }, { "epoch": 0.37907850504251533, "step": 3834, "train/total_loss": 0.20917218923568726 }, { "entropy": 8.8695068359375, "epoch": 0.3791773778920309, "mean_token_accuracy": 0.7569676637649536, "num_tokens": 21079750.0, "step": 3835, "train/ce_loss": 0.545565664768219 }, { "epoch": 0.3791773778920309, "step": 3835, "train/sim_loss": 0.05859375 }, { "epoch": 0.3791773778920309, "step": 3835, "train/total_loss": 0.11315031349658966 }, { "entropy": 8.772150039672852, "epoch": 0.37927625074154636, "mean_token_accuracy": 0.7773019075393677, "num_tokens": 21085286.0, "step": 3836, "train/ce_loss": 0.632908821105957 }, { "epoch": 0.37927625074154636, "step": 3836, "train/sim_loss": 0.12109375 }, { "epoch": 0.37927625074154636, "step": 3836, "train/total_loss": 0.18438464403152466 }, { "entropy": 8.482931137084961, "epoch": 0.3793751235910619, "mean_token_accuracy": 0.7955596446990967, "num_tokens": 21091051.0, "step": 3837, "train/ce_loss": 0.6580840945243835 }, { "epoch": 0.3793751235910619, "step": 3837, "train/sim_loss": 0.109375 }, { "epoch": 0.3793751235910619, "step": 3837, "train/total_loss": 0.17518341541290283 }, { "entropy": 8.294065475463867, "epoch": 0.37947399644057744, "mean_token_accuracy": 0.7491409182548523, "num_tokens": 21096582.0, "step": 3838, "train/ce_loss": 0.807155191898346 }, { "epoch": 0.37947399644057744, "step": 3838, "train/sim_loss": 0.05078125 }, { "epoch": 0.37947399644057744, "step": 3838, "train/total_loss": 0.13149677217006683 }, { "entropy": 8.932235717773438, "epoch": 0.37957286929009293, "mean_token_accuracy": 0.7668789625167847, "num_tokens": 21102043.0, "step": 3839, "train/ce_loss": 0.523188054561615 }, { "epoch": 0.37957286929009293, "step": 3839, "train/sim_loss": 0.04296875 }, { "epoch": 0.37957286929009293, "step": 3839, "train/total_loss": 0.09528756141662598 }, { "epoch": 0.37967174213960847, "grad_norm": 0.7822229266166687, "learning_rate": 9.05330564209069e-06, "loss": 0.1423, "step": 3840 }, { "entropy": 8.948104858398438, "epoch": 0.37967174213960847, "mean_token_accuracy": 0.6881837844848633, "num_tokens": 21107571.0, "step": 3840, "train/ce_loss": 0.9860690236091614 }, { "epoch": 0.37967174213960847, "step": 3840, "train/sim_loss": 0.0859375 }, { "epoch": 0.37967174213960847, "step": 3840, "train/total_loss": 0.1845444142818451 }, { "entropy": 8.520598411560059, "epoch": 0.379770614989124, "mean_token_accuracy": 0.7622673511505127, "num_tokens": 21113398.0, "step": 3841, "train/ce_loss": 0.7346155047416687 }, { "epoch": 0.379770614989124, "step": 3841, "train/sim_loss": 0.02734375 }, { "epoch": 0.379770614989124, "step": 3841, "train/total_loss": 0.10080530494451523 }, { "entropy": 9.048454284667969, "epoch": 0.3798694878386395, "mean_token_accuracy": 0.7509778141975403, "num_tokens": 21118827.0, "step": 3842, "train/ce_loss": 1.0231587886810303 }, { "epoch": 0.3798694878386395, "step": 3842, "train/sim_loss": 0.05859375 }, { "epoch": 0.3798694878386395, "step": 3842, "train/total_loss": 0.16090962290763855 }, { "entropy": 8.788230895996094, "epoch": 0.37996836068815504, "mean_token_accuracy": 0.8022598624229431, "num_tokens": 21124375.0, "step": 3843, "train/ce_loss": 0.8226823806762695 }, { "epoch": 0.37996836068815504, "step": 3843, "train/sim_loss": 0.06640625 }, { "epoch": 0.37996836068815504, "step": 3843, "train/total_loss": 0.14867448806762695 }, { "entropy": 8.971113204956055, "epoch": 0.3800672335376706, "mean_token_accuracy": 0.7355931997299194, "num_tokens": 21129856.0, "step": 3844, "train/ce_loss": 1.0519373416900635 }, { "epoch": 0.3800672335376706, "step": 3844, "train/sim_loss": 0.05859375 }, { "epoch": 0.3800672335376706, "step": 3844, "train/total_loss": 0.16378748416900635 }, { "entropy": 9.00663948059082, "epoch": 0.38016610638718606, "mean_token_accuracy": 0.7906249761581421, "num_tokens": 21135085.0, "step": 3845, "train/ce_loss": 0.6249416470527649 }, { "epoch": 0.38016610638718606, "step": 3845, "train/sim_loss": 0.0546875 }, { "epoch": 0.38016610638718606, "step": 3845, "train/total_loss": 0.11718166619539261 }, { "entropy": 9.01113224029541, "epoch": 0.3802649792367016, "mean_token_accuracy": 0.7939233779907227, "num_tokens": 21140471.0, "step": 3846, "train/ce_loss": 0.8379577994346619 }, { "epoch": 0.3802649792367016, "step": 3846, "train/sim_loss": 0.06640625 }, { "epoch": 0.3802649792367016, "step": 3846, "train/total_loss": 0.15020203590393066 }, { "entropy": 8.63878059387207, "epoch": 0.38036385208621715, "mean_token_accuracy": 0.7540670037269592, "num_tokens": 21146161.0, "step": 3847, "train/ce_loss": 0.5745722055435181 }, { "epoch": 0.38036385208621715, "step": 3847, "train/sim_loss": 0.02734375 }, { "epoch": 0.38036385208621715, "step": 3847, "train/total_loss": 0.08480097353458405 }, { "entropy": 8.806375503540039, "epoch": 0.38046272493573263, "mean_token_accuracy": 0.7538829445838928, "num_tokens": 21151627.0, "step": 3848, "train/ce_loss": 1.0257337093353271 }, { "epoch": 0.38046272493573263, "step": 3848, "train/sim_loss": 0.0859375 }, { "epoch": 0.38046272493573263, "step": 3848, "train/total_loss": 0.18851086497306824 }, { "entropy": 8.788896560668945, "epoch": 0.3805615977852482, "mean_token_accuracy": 0.774117648601532, "num_tokens": 21157095.0, "step": 3849, "train/ce_loss": 0.4530021846294403 }, { "epoch": 0.3805615977852482, "step": 3849, "train/sim_loss": 0.0390625 }, { "epoch": 0.3805615977852482, "step": 3849, "train/total_loss": 0.08436271548271179 }, { "entropy": 8.844514846801758, "epoch": 0.3806604706347637, "mean_token_accuracy": 0.738070011138916, "num_tokens": 21162714.0, "step": 3850, "train/ce_loss": 0.47606977820396423 }, { "epoch": 0.3806604706347637, "step": 3850, "train/sim_loss": 0.01953125 }, { "epoch": 0.3806604706347637, "step": 3850, "train/total_loss": 0.06713822484016418 }, { "entropy": 8.988143920898438, "epoch": 0.3807593434842792, "mean_token_accuracy": 0.7651296854019165, "num_tokens": 21168054.0, "step": 3851, "train/ce_loss": 0.30144867300987244 }, { "epoch": 0.3807593434842792, "step": 3851, "train/sim_loss": 0.046875 }, { "epoch": 0.3807593434842792, "step": 3851, "train/total_loss": 0.07701987028121948 }, { "entropy": 8.433156967163086, "epoch": 0.38085821633379474, "mean_token_accuracy": 0.6684573888778687, "num_tokens": 21173897.0, "step": 3852, "train/ce_loss": 2.2453715801239014 }, { "epoch": 0.38085821633379474, "step": 3852, "train/sim_loss": 0.09765625 }, { "epoch": 0.38085821633379474, "step": 3852, "train/total_loss": 0.3221934139728546 }, { "entropy": 8.614599227905273, "epoch": 0.3809570891833103, "mean_token_accuracy": 0.727992057800293, "num_tokens": 21179523.0, "step": 3853, "train/ce_loss": 0.8539880514144897 }, { "epoch": 0.3809570891833103, "step": 3853, "train/sim_loss": 0.08203125 }, { "epoch": 0.3809570891833103, "step": 3853, "train/total_loss": 0.1674300581216812 }, { "entropy": 8.844715118408203, "epoch": 0.38105596203282577, "mean_token_accuracy": 0.7578215599060059, "num_tokens": 21184979.0, "step": 3854, "train/ce_loss": 1.223516583442688 }, { "epoch": 0.38105596203282577, "step": 3854, "train/sim_loss": 0.0546875 }, { "epoch": 0.38105596203282577, "step": 3854, "train/total_loss": 0.17703916132450104 }, { "entropy": 8.73879623413086, "epoch": 0.3811548348823413, "mean_token_accuracy": 0.7563600540161133, "num_tokens": 21190709.0, "step": 3855, "train/ce_loss": 1.4137024879455566 }, { "epoch": 0.3811548348823413, "step": 3855, "train/sim_loss": 0.1484375 }, { "epoch": 0.3811548348823413, "step": 3855, "train/total_loss": 0.2898077368736267 }, { "entropy": 9.236467361450195, "epoch": 0.38125370773185685, "mean_token_accuracy": 0.6932953000068665, "num_tokens": 21195922.0, "step": 3856, "train/ce_loss": 1.0469584465026855 }, { "epoch": 0.38125370773185685, "step": 3856, "train/sim_loss": 0.0703125 }, { "epoch": 0.38125370773185685, "step": 3856, "train/total_loss": 0.1750083565711975 }, { "entropy": 8.637985229492188, "epoch": 0.38135258058137234, "mean_token_accuracy": 0.7141304612159729, "num_tokens": 21201378.0, "step": 3857, "train/ce_loss": 0.9348074793815613 }, { "epoch": 0.38135258058137234, "step": 3857, "train/sim_loss": 0.05078125 }, { "epoch": 0.38135258058137234, "step": 3857, "train/total_loss": 0.14426200091838837 }, { "entropy": 8.844162940979004, "epoch": 0.3814514534308879, "mean_token_accuracy": 0.7966291904449463, "num_tokens": 21206870.0, "step": 3858, "train/ce_loss": 0.5107932686805725 }, { "epoch": 0.3814514534308879, "step": 3858, "train/sim_loss": 0.0234375 }, { "epoch": 0.3814514534308879, "step": 3858, "train/total_loss": 0.07451683282852173 }, { "entropy": 8.950641632080078, "epoch": 0.3815503262804034, "mean_token_accuracy": 0.7338281869888306, "num_tokens": 21212444.0, "step": 3859, "train/ce_loss": 0.32091718912124634 }, { "epoch": 0.3815503262804034, "step": 3859, "train/sim_loss": 0.046875 }, { "epoch": 0.3815503262804034, "step": 3859, "train/total_loss": 0.07896672189235687 }, { "epoch": 0.3816491991299189, "grad_norm": 0.8972483277320862, "learning_rate": 9.04836077733274e-06, "loss": 0.1379, "step": 3860 }, { "entropy": 9.034412384033203, "epoch": 0.3816491991299189, "mean_token_accuracy": 0.7699005007743835, "num_tokens": 21217838.0, "step": 3860, "train/ce_loss": 0.7843201160430908 }, { "epoch": 0.3816491991299189, "step": 3860, "train/sim_loss": 0.07421875 }, { "epoch": 0.3816491991299189, "step": 3860, "train/total_loss": 0.15265077352523804 }, { "entropy": 8.758909225463867, "epoch": 0.38174807197943444, "mean_token_accuracy": 0.7454100251197815, "num_tokens": 21223244.0, "step": 3861, "train/ce_loss": 0.7793702483177185 }, { "epoch": 0.38174807197943444, "step": 3861, "train/sim_loss": 0.06640625 }, { "epoch": 0.38174807197943444, "step": 3861, "train/total_loss": 0.1443432867527008 }, { "entropy": 8.757303237915039, "epoch": 0.38184694482895, "mean_token_accuracy": 0.7433333396911621, "num_tokens": 21228783.0, "step": 3862, "train/ce_loss": 0.4476719796657562 }, { "epoch": 0.38184694482895, "step": 3862, "train/sim_loss": 0.06640625 }, { "epoch": 0.38184694482895, "step": 3862, "train/total_loss": 0.11117345094680786 }, { "entropy": 8.950553894042969, "epoch": 0.38194581767846547, "mean_token_accuracy": 0.7311111092567444, "num_tokens": 21234344.0, "step": 3863, "train/ce_loss": 1.5211460590362549 }, { "epoch": 0.38194581767846547, "step": 3863, "train/sim_loss": 0.1015625 }, { "epoch": 0.38194581767846547, "step": 3863, "train/total_loss": 0.2536771297454834 }, { "entropy": 8.707498550415039, "epoch": 0.382044690527981, "mean_token_accuracy": 0.7600950002670288, "num_tokens": 21239829.0, "step": 3864, "train/ce_loss": 1.079357624053955 }, { "epoch": 0.382044690527981, "step": 3864, "train/sim_loss": 0.0703125 }, { "epoch": 0.382044690527981, "step": 3864, "train/total_loss": 0.17824825644493103 }, { "entropy": 8.73147201538086, "epoch": 0.38214356337749655, "mean_token_accuracy": 0.7335359454154968, "num_tokens": 21245467.0, "step": 3865, "train/ce_loss": 1.0302342176437378 }, { "epoch": 0.38214356337749655, "step": 3865, "train/sim_loss": 0.10546875 }, { "epoch": 0.38214356337749655, "step": 3865, "train/total_loss": 0.20849217474460602 }, { "entropy": 8.926312446594238, "epoch": 0.38224243622701204, "mean_token_accuracy": 0.74685138463974, "num_tokens": 21250858.0, "step": 3866, "train/ce_loss": 0.9033011198043823 }, { "epoch": 0.38224243622701204, "step": 3866, "train/sim_loss": 0.07421875 }, { "epoch": 0.38224243622701204, "step": 3866, "train/total_loss": 0.1645488739013672 }, { "entropy": 8.624496459960938, "epoch": 0.3823413090765276, "mean_token_accuracy": 0.6994082927703857, "num_tokens": 21256346.0, "step": 3867, "train/ce_loss": 0.5262025594711304 }, { "epoch": 0.3823413090765276, "step": 3867, "train/sim_loss": 0.05859375 }, { "epoch": 0.3823413090765276, "step": 3867, "train/total_loss": 0.11121401190757751 }, { "entropy": 9.421531677246094, "epoch": 0.3824401819260431, "mean_token_accuracy": 0.7564296722412109, "num_tokens": 21261761.0, "step": 3868, "train/ce_loss": 0.5664533972740173 }, { "epoch": 0.3824401819260431, "step": 3868, "train/sim_loss": 0.0390625 }, { "epoch": 0.3824401819260431, "step": 3868, "train/total_loss": 0.09570784121751785 }, { "entropy": 8.935425758361816, "epoch": 0.3825390547755586, "mean_token_accuracy": 0.7626146674156189, "num_tokens": 21267301.0, "step": 3869, "train/ce_loss": 0.7413113117218018 }, { "epoch": 0.3825390547755586, "step": 3869, "train/sim_loss": 0.078125 }, { "epoch": 0.3825390547755586, "step": 3869, "train/total_loss": 0.15225613117218018 }, { "entropy": 8.7197265625, "epoch": 0.38263792762507415, "mean_token_accuracy": 0.7222777009010315, "num_tokens": 21272931.0, "step": 3870, "train/ce_loss": 0.6525998711585999 }, { "epoch": 0.38263792762507415, "step": 3870, "train/sim_loss": 0.04296875 }, { "epoch": 0.38263792762507415, "step": 3870, "train/total_loss": 0.10822873562574387 }, { "entropy": 9.05545425415039, "epoch": 0.3827368004745897, "mean_token_accuracy": 0.7033492922782898, "num_tokens": 21278408.0, "step": 3871, "train/ce_loss": 1.4014407396316528 }, { "epoch": 0.3827368004745897, "step": 3871, "train/sim_loss": 0.11328125 }, { "epoch": 0.3827368004745897, "step": 3871, "train/total_loss": 0.25342532992362976 }, { "entropy": 8.888392448425293, "epoch": 0.3828356733241052, "mean_token_accuracy": 0.7535714507102966, "num_tokens": 21283913.0, "step": 3872, "train/ce_loss": 1.1179466247558594 }, { "epoch": 0.3828356733241052, "step": 3872, "train/sim_loss": 0.046875 }, { "epoch": 0.3828356733241052, "step": 3872, "train/total_loss": 0.15866966545581818 }, { "entropy": 9.061868667602539, "epoch": 0.3829345461736207, "mean_token_accuracy": 0.7162162065505981, "num_tokens": 21289447.0, "step": 3873, "train/ce_loss": 0.8305484056472778 }, { "epoch": 0.3829345461736207, "step": 3873, "train/sim_loss": 0.10546875 }, { "epoch": 0.3829345461736207, "step": 3873, "train/total_loss": 0.18852359056472778 }, { "entropy": 8.97315502166748, "epoch": 0.38303341902313626, "mean_token_accuracy": 0.7583892345428467, "num_tokens": 21294774.0, "step": 3874, "train/ce_loss": 0.788112461566925 }, { "epoch": 0.38303341902313626, "step": 3874, "train/sim_loss": 0.04296875 }, { "epoch": 0.38303341902313626, "step": 3874, "train/total_loss": 0.12178000062704086 }, { "entropy": 8.764535903930664, "epoch": 0.38313229187265174, "mean_token_accuracy": 0.780379056930542, "num_tokens": 21300309.0, "step": 3875, "train/ce_loss": 0.7740949392318726 }, { "epoch": 0.38313229187265174, "step": 3875, "train/sim_loss": 0.05859375 }, { "epoch": 0.38313229187265174, "step": 3875, "train/total_loss": 0.1360032558441162 }, { "entropy": 8.716911315917969, "epoch": 0.3832311647221673, "mean_token_accuracy": 0.6983349919319153, "num_tokens": 21305880.0, "step": 3876, "train/ce_loss": 2.014498233795166 }, { "epoch": 0.3832311647221673, "step": 3876, "train/sim_loss": 0.0703125 }, { "epoch": 0.3832311647221673, "step": 3876, "train/total_loss": 0.27176231145858765 }, { "entropy": 9.421287536621094, "epoch": 0.3833300375716828, "mean_token_accuracy": 0.7339593172073364, "num_tokens": 21311049.0, "step": 3877, "train/ce_loss": 0.6708891987800598 }, { "epoch": 0.3833300375716828, "step": 3877, "train/sim_loss": 0.05859375 }, { "epoch": 0.3833300375716828, "step": 3877, "train/total_loss": 0.12568268179893494 }, { "entropy": 8.945968627929688, "epoch": 0.38342891042119837, "mean_token_accuracy": 0.7490092515945435, "num_tokens": 21316365.0, "step": 3878, "train/ce_loss": 0.4996739327907562 }, { "epoch": 0.38342891042119837, "step": 3878, "train/sim_loss": 0.12109375 }, { "epoch": 0.38342891042119837, "step": 3878, "train/total_loss": 0.17106114327907562 }, { "entropy": 8.787198066711426, "epoch": 0.38352778327071385, "mean_token_accuracy": 0.7076923251152039, "num_tokens": 21321900.0, "step": 3879, "train/ce_loss": 1.1168222427368164 }, { "epoch": 0.38352778327071385, "step": 3879, "train/sim_loss": 0.1953125 }, { "epoch": 0.38352778327071385, "step": 3879, "train/total_loss": 0.3069947361946106 }, { "epoch": 0.3836266561202294, "grad_norm": 1.062904953956604, "learning_rate": 9.043415912574793e-06, "loss": 0.1522, "step": 3880 }, { "entropy": 8.96767807006836, "epoch": 0.3836266561202294, "mean_token_accuracy": 0.7612903118133545, "num_tokens": 21327373.0, "step": 3880, "train/ce_loss": 0.8104903101921082 }, { "epoch": 0.3836266561202294, "step": 3880, "train/sim_loss": 0.046875 }, { "epoch": 0.3836266561202294, "step": 3880, "train/total_loss": 0.12792402505874634 }, { "entropy": 8.92591381072998, "epoch": 0.38372552896974493, "mean_token_accuracy": 0.7242646813392639, "num_tokens": 21332854.0, "step": 3881, "train/ce_loss": 1.0501210689544678 }, { "epoch": 0.38372552896974493, "step": 3881, "train/sim_loss": 0.0703125 }, { "epoch": 0.38372552896974493, "step": 3881, "train/total_loss": 0.17532461881637573 }, { "entropy": 8.301193237304688, "epoch": 0.3838244018192604, "mean_token_accuracy": 0.7825384140014648, "num_tokens": 21338717.0, "step": 3882, "train/ce_loss": 1.6101077795028687 }, { "epoch": 0.3838244018192604, "step": 3882, "train/sim_loss": 0.07421875 }, { "epoch": 0.3838244018192604, "step": 3882, "train/total_loss": 0.23522953689098358 }, { "entropy": 8.799752235412598, "epoch": 0.38392327466877596, "mean_token_accuracy": 0.7232510447502136, "num_tokens": 21344366.0, "step": 3883, "train/ce_loss": 0.6001482009887695 }, { "epoch": 0.38392327466877596, "step": 3883, "train/sim_loss": 0.08984375 }, { "epoch": 0.38392327466877596, "step": 3883, "train/total_loss": 0.14985856413841248 }, { "entropy": 8.706857681274414, "epoch": 0.3840221475182915, "mean_token_accuracy": 0.746065080165863, "num_tokens": 21349993.0, "step": 3884, "train/ce_loss": 0.6744598150253296 }, { "epoch": 0.3840221475182915, "step": 3884, "train/sim_loss": 0.01953125 }, { "epoch": 0.3840221475182915, "step": 3884, "train/total_loss": 0.08697723597288132 }, { "entropy": 8.857799530029297, "epoch": 0.384121020367807, "mean_token_accuracy": 0.7794594764709473, "num_tokens": 21355558.0, "step": 3885, "train/ce_loss": 0.4659673273563385 }, { "epoch": 0.384121020367807, "step": 3885, "train/sim_loss": 0.0234375 }, { "epoch": 0.384121020367807, "step": 3885, "train/total_loss": 0.07003423571586609 }, { "entropy": 8.957010269165039, "epoch": 0.38421989321732253, "mean_token_accuracy": 0.7548918724060059, "num_tokens": 21361117.0, "step": 3886, "train/ce_loss": 0.3798622786998749 }, { "epoch": 0.38421989321732253, "step": 3886, "train/sim_loss": 0.01953125 }, { "epoch": 0.38421989321732253, "step": 3886, "train/total_loss": 0.05751748010516167 }, { "entropy": 8.83621883392334, "epoch": 0.38431876606683807, "mean_token_accuracy": 0.733564019203186, "num_tokens": 21366641.0, "step": 3887, "train/ce_loss": 0.842145562171936 }, { "epoch": 0.38431876606683807, "step": 3887, "train/sim_loss": 0.0703125 }, { "epoch": 0.38431876606683807, "step": 3887, "train/total_loss": 0.15452706813812256 }, { "entropy": 8.739346504211426, "epoch": 0.38441763891635355, "mean_token_accuracy": 0.7084494233131409, "num_tokens": 21372321.0, "step": 3888, "train/ce_loss": 1.907489538192749 }, { "epoch": 0.38441763891635355, "step": 3888, "train/sim_loss": 0.07421875 }, { "epoch": 0.38441763891635355, "step": 3888, "train/total_loss": 0.2649677097797394 }, { "entropy": 9.031648635864258, "epoch": 0.3845165117658691, "mean_token_accuracy": 0.7402191162109375, "num_tokens": 21377568.0, "step": 3889, "train/ce_loss": 0.6417738199234009 }, { "epoch": 0.3845165117658691, "step": 3889, "train/sim_loss": 0.0625 }, { "epoch": 0.3845165117658691, "step": 3889, "train/total_loss": 0.12667739391326904 }, { "entropy": 9.034571647644043, "epoch": 0.38461538461538464, "mean_token_accuracy": 0.7537922859191895, "num_tokens": 21383075.0, "step": 3890, "train/ce_loss": 0.8706790208816528 }, { "epoch": 0.38461538461538464, "step": 3890, "train/sim_loss": 0.0625 }, { "epoch": 0.38461538461538464, "step": 3890, "train/total_loss": 0.14956790208816528 }, { "entropy": 9.159872055053711, "epoch": 0.3847142574649001, "mean_token_accuracy": 0.777479887008667, "num_tokens": 21388482.0, "step": 3891, "train/ce_loss": 1.0822908878326416 }, { "epoch": 0.3847142574649001, "step": 3891, "train/sim_loss": 0.0703125 }, { "epoch": 0.3847142574649001, "step": 3891, "train/total_loss": 0.17854160070419312 }, { "entropy": 8.611680030822754, "epoch": 0.38481313031441566, "mean_token_accuracy": 0.7490421533584595, "num_tokens": 21394151.0, "step": 3892, "train/ce_loss": 0.5383347272872925 }, { "epoch": 0.38481313031441566, "step": 3892, "train/sim_loss": 0.046875 }, { "epoch": 0.38481313031441566, "step": 3892, "train/total_loss": 0.10070846974849701 }, { "entropy": 8.904855728149414, "epoch": 0.3849120031639312, "mean_token_accuracy": 0.7120370268821716, "num_tokens": 21399760.0, "step": 3893, "train/ce_loss": 0.4568396806716919 }, { "epoch": 0.3849120031639312, "step": 3893, "train/sim_loss": 0.046875 }, { "epoch": 0.3849120031639312, "step": 3893, "train/total_loss": 0.09255896508693695 }, { "entropy": 9.257871627807617, "epoch": 0.3850108760134467, "mean_token_accuracy": 0.7447698712348938, "num_tokens": 21405024.0, "step": 3894, "train/ce_loss": 1.1136523485183716 }, { "epoch": 0.3850108760134467, "step": 3894, "train/sim_loss": 0.07421875 }, { "epoch": 0.3850108760134467, "step": 3894, "train/total_loss": 0.18558397889137268 }, { "entropy": 8.807576179504395, "epoch": 0.38510974886296223, "mean_token_accuracy": 0.7995310425758362, "num_tokens": 21410410.0, "step": 3895, "train/ce_loss": 0.7876639366149902 }, { "epoch": 0.38510974886296223, "step": 3895, "train/sim_loss": 0.0546875 }, { "epoch": 0.38510974886296223, "step": 3895, "train/total_loss": 0.13345390558242798 }, { "entropy": 8.885173797607422, "epoch": 0.3852086217124778, "mean_token_accuracy": 0.7475149035453796, "num_tokens": 21416080.0, "step": 3896, "train/ce_loss": 0.6518584489822388 }, { "epoch": 0.3852086217124778, "step": 3896, "train/sim_loss": 0.046875 }, { "epoch": 0.3852086217124778, "step": 3896, "train/total_loss": 0.11206084489822388 }, { "entropy": 8.702425003051758, "epoch": 0.38530749456199326, "mean_token_accuracy": 0.7508981823921204, "num_tokens": 21421540.0, "step": 3897, "train/ce_loss": 0.5613961219787598 }, { "epoch": 0.38530749456199326, "step": 3897, "train/sim_loss": 0.02734375 }, { "epoch": 0.38530749456199326, "step": 3897, "train/total_loss": 0.08348336815834045 }, { "entropy": 9.24789810180664, "epoch": 0.3854063674115088, "mean_token_accuracy": 0.7455540299415588, "num_tokens": 21426861.0, "step": 3898, "train/ce_loss": 0.35221150517463684 }, { "epoch": 0.3854063674115088, "step": 3898, "train/sim_loss": 0.0234375 }, { "epoch": 0.3854063674115088, "step": 3898, "train/total_loss": 0.0586586520075798 }, { "entropy": 8.810235023498535, "epoch": 0.38550524026102434, "mean_token_accuracy": 0.7890382409095764, "num_tokens": 21432455.0, "step": 3899, "train/ce_loss": 0.8596355319023132 }, { "epoch": 0.38550524026102434, "step": 3899, "train/sim_loss": 0.03125 }, { "epoch": 0.38550524026102434, "step": 3899, "train/total_loss": 0.11721355468034744 }, { "epoch": 0.3856041131105398, "grad_norm": 0.8186221122741699, "learning_rate": 9.038471047816842e-06, "loss": 0.1395, "step": 3900 }, { "entropy": 8.802820205688477, "epoch": 0.3856041131105398, "mean_token_accuracy": 0.7154566645622253, "num_tokens": 21438045.0, "step": 3900, "train/ce_loss": 0.5289274454116821 }, { "epoch": 0.3856041131105398, "step": 3900, "train/sim_loss": 0.08984375 }, { "epoch": 0.3856041131105398, "step": 3900, "train/total_loss": 0.1427364945411682 }, { "entropy": 9.304903984069824, "epoch": 0.38570298596005537, "mean_token_accuracy": 0.7108262181282043, "num_tokens": 21443307.0, "step": 3901, "train/ce_loss": 1.2824786901474 }, { "epoch": 0.38570298596005537, "step": 3901, "train/sim_loss": 0.1015625 }, { "epoch": 0.38570298596005537, "step": 3901, "train/total_loss": 0.22981037199497223 }, { "entropy": 8.64146900177002, "epoch": 0.3858018588095709, "mean_token_accuracy": 0.7247037291526794, "num_tokens": 21449012.0, "step": 3902, "train/ce_loss": 0.7039288878440857 }, { "epoch": 0.3858018588095709, "step": 3902, "train/sim_loss": 0.0390625 }, { "epoch": 0.3858018588095709, "step": 3902, "train/total_loss": 0.10945539176464081 }, { "entropy": 9.326865196228027, "epoch": 0.3859007316590864, "mean_token_accuracy": 0.7265521883964539, "num_tokens": 21454407.0, "step": 3903, "train/ce_loss": 0.865058958530426 }, { "epoch": 0.3859007316590864, "step": 3903, "train/sim_loss": 0.1015625 }, { "epoch": 0.3859007316590864, "step": 3903, "train/total_loss": 0.18806838989257812 }, { "entropy": 8.797103881835938, "epoch": 0.38599960450860193, "mean_token_accuracy": 0.7956867218017578, "num_tokens": 21459864.0, "step": 3904, "train/ce_loss": 0.6070286631584167 }, { "epoch": 0.38599960450860193, "step": 3904, "train/sim_loss": 0.046875 }, { "epoch": 0.38599960450860193, "step": 3904, "train/total_loss": 0.1075778678059578 }, { "entropy": 9.129537582397461, "epoch": 0.3860984773581175, "mean_token_accuracy": 0.7414448857307434, "num_tokens": 21465218.0, "step": 3905, "train/ce_loss": 0.6255884766578674 }, { "epoch": 0.3860984773581175, "step": 3905, "train/sim_loss": 0.0546875 }, { "epoch": 0.3860984773581175, "step": 3905, "train/total_loss": 0.1172463521361351 }, { "entropy": 9.04023265838623, "epoch": 0.38619735020763296, "mean_token_accuracy": 0.7693251371383667, "num_tokens": 21470695.0, "step": 3906, "train/ce_loss": 0.6718564033508301 }, { "epoch": 0.38619735020763296, "step": 3906, "train/sim_loss": 0.0234375 }, { "epoch": 0.38619735020763296, "step": 3906, "train/total_loss": 0.09062314033508301 }, { "entropy": 9.259773254394531, "epoch": 0.3862962230571485, "mean_token_accuracy": 0.7708333134651184, "num_tokens": 21475984.0, "step": 3907, "train/ce_loss": 0.800693154335022 }, { "epoch": 0.3862962230571485, "step": 3907, "train/sim_loss": 0.046875 }, { "epoch": 0.3862962230571485, "step": 3907, "train/total_loss": 0.12694431841373444 }, { "entropy": 8.909191131591797, "epoch": 0.38639509590666404, "mean_token_accuracy": 0.7232846021652222, "num_tokens": 21481536.0, "step": 3908, "train/ce_loss": 1.2879759073257446 }, { "epoch": 0.38639509590666404, "step": 3908, "train/sim_loss": 0.1796875 }, { "epoch": 0.38639509590666404, "step": 3908, "train/total_loss": 0.30848509073257446 }, { "entropy": 8.756535530090332, "epoch": 0.38649396875617953, "mean_token_accuracy": 0.7297872304916382, "num_tokens": 21487038.0, "step": 3909, "train/ce_loss": 0.9316304326057434 }, { "epoch": 0.38649396875617953, "step": 3909, "train/sim_loss": 0.1171875 }, { "epoch": 0.38649396875617953, "step": 3909, "train/total_loss": 0.21035054326057434 }, { "entropy": 8.650131225585938, "epoch": 0.38659284160569507, "mean_token_accuracy": 0.7269841432571411, "num_tokens": 21492603.0, "step": 3910, "train/ce_loss": 0.6294261813163757 }, { "epoch": 0.38659284160569507, "step": 3910, "train/sim_loss": 0.0625 }, { "epoch": 0.38659284160569507, "step": 3910, "train/total_loss": 0.12544262409210205 }, { "entropy": 8.928340911865234, "epoch": 0.3866917144552106, "mean_token_accuracy": 0.7530562281608582, "num_tokens": 21498062.0, "step": 3911, "train/ce_loss": 0.8933002352714539 }, { "epoch": 0.3866917144552106, "step": 3911, "train/sim_loss": 0.0703125 }, { "epoch": 0.3866917144552106, "step": 3911, "train/total_loss": 0.1596425175666809 }, { "entropy": 8.606229782104492, "epoch": 0.3867905873047261, "mean_token_accuracy": 0.7232304811477661, "num_tokens": 21503849.0, "step": 3912, "train/ce_loss": 1.0871928930282593 }, { "epoch": 0.3867905873047261, "step": 3912, "train/sim_loss": 0.09375 }, { "epoch": 0.3867905873047261, "step": 3912, "train/total_loss": 0.20246928930282593 }, { "entropy": 8.940369606018066, "epoch": 0.38688946015424164, "mean_token_accuracy": 0.778205156326294, "num_tokens": 21509289.0, "step": 3913, "train/ce_loss": 0.9635294675827026 }, { "epoch": 0.38688946015424164, "step": 3913, "train/sim_loss": 0.03515625 }, { "epoch": 0.38688946015424164, "step": 3913, "train/total_loss": 0.1315091997385025 }, { "entropy": 8.950922012329102, "epoch": 0.3869883330037572, "mean_token_accuracy": 0.6869871020317078, "num_tokens": 21514711.0, "step": 3914, "train/ce_loss": 0.6258904933929443 }, { "epoch": 0.3869883330037572, "step": 3914, "train/sim_loss": 0.03515625 }, { "epoch": 0.3869883330037572, "step": 3914, "train/total_loss": 0.09774529933929443 }, { "entropy": 8.932623863220215, "epoch": 0.38708720585327266, "mean_token_accuracy": 0.721238911151886, "num_tokens": 21520158.0, "step": 3915, "train/ce_loss": 0.49398061633110046 }, { "epoch": 0.38708720585327266, "step": 3915, "train/sim_loss": 0.03125 }, { "epoch": 0.38708720585327266, "step": 3915, "train/total_loss": 0.08064806461334229 }, { "entropy": 9.061626434326172, "epoch": 0.3871860787027882, "mean_token_accuracy": 0.682741105556488, "num_tokens": 21525605.0, "step": 3916, "train/ce_loss": 0.8660735487937927 }, { "epoch": 0.3871860787027882, "step": 3916, "train/sim_loss": 0.11328125 }, { "epoch": 0.3871860787027882, "step": 3916, "train/total_loss": 0.19988861680030823 }, { "entropy": 8.985416412353516, "epoch": 0.38728495155230375, "mean_token_accuracy": 0.7796609997749329, "num_tokens": 21531088.0, "step": 3917, "train/ce_loss": 0.7165873050689697 }, { "epoch": 0.38728495155230375, "step": 3917, "train/sim_loss": 0.0390625 }, { "epoch": 0.38728495155230375, "step": 3917, "train/total_loss": 0.11072123050689697 }, { "entropy": 9.205558776855469, "epoch": 0.3873838244018193, "mean_token_accuracy": 0.7463087439537048, "num_tokens": 21536407.0, "step": 3918, "train/ce_loss": 1.327136516571045 }, { "epoch": 0.3873838244018193, "step": 3918, "train/sim_loss": 0.1015625 }, { "epoch": 0.3873838244018193, "step": 3918, "train/total_loss": 0.2342761605978012 }, { "entropy": 9.21081256866455, "epoch": 0.3874826972513348, "mean_token_accuracy": 0.771276593208313, "num_tokens": 21541750.0, "step": 3919, "train/ce_loss": 0.5359767079353333 }, { "epoch": 0.3874826972513348, "step": 3919, "train/sim_loss": 0.09375 }, { "epoch": 0.3874826972513348, "step": 3919, "train/total_loss": 0.14734767377376556 }, { "epoch": 0.3875815701008503, "grad_norm": 0.7237203121185303, "learning_rate": 9.033526183058894e-06, "loss": 0.1476, "step": 3920 }, { "entropy": 9.065656661987305, "epoch": 0.3875815701008503, "mean_token_accuracy": 0.7532163858413696, "num_tokens": 21547181.0, "step": 3920, "train/ce_loss": 1.577865719795227 }, { "epoch": 0.3875815701008503, "step": 3920, "train/sim_loss": 0.0703125 }, { "epoch": 0.3875815701008503, "step": 3920, "train/total_loss": 0.22809907793998718 }, { "entropy": 8.965932846069336, "epoch": 0.38768044295036586, "mean_token_accuracy": 0.7165563106536865, "num_tokens": 21552576.0, "step": 3921, "train/ce_loss": 1.136023759841919 }, { "epoch": 0.38768044295036586, "step": 3921, "train/sim_loss": 0.078125 }, { "epoch": 0.38768044295036586, "step": 3921, "train/total_loss": 0.19172737002372742 }, { "entropy": 9.208133697509766, "epoch": 0.38777931579988134, "mean_token_accuracy": 0.7085427045822144, "num_tokens": 21557947.0, "step": 3922, "train/ce_loss": 1.0497748851776123 }, { "epoch": 0.38777931579988134, "step": 3922, "train/sim_loss": 0.0625 }, { "epoch": 0.38777931579988134, "step": 3922, "train/total_loss": 0.16747748851776123 }, { "entropy": 8.951138496398926, "epoch": 0.3878781886493969, "mean_token_accuracy": 0.772885262966156, "num_tokens": 21563437.0, "step": 3923, "train/ce_loss": 0.5004251003265381 }, { "epoch": 0.3878781886493969, "step": 3923, "train/sim_loss": 0.09765625 }, { "epoch": 0.3878781886493969, "step": 3923, "train/total_loss": 0.1476987600326538 }, { "entropy": 8.887250900268555, "epoch": 0.3879770614989124, "mean_token_accuracy": 0.7977142930030823, "num_tokens": 21568970.0, "step": 3924, "train/ce_loss": 0.4992687702178955 }, { "epoch": 0.3879770614989124, "step": 3924, "train/sim_loss": 0.078125 }, { "epoch": 0.3879770614989124, "step": 3924, "train/total_loss": 0.12805187702178955 }, { "entropy": 8.81665325164795, "epoch": 0.3880759343484279, "mean_token_accuracy": 0.736580491065979, "num_tokens": 21574643.0, "step": 3925, "train/ce_loss": 0.7383850812911987 }, { "epoch": 0.3880759343484279, "step": 3925, "train/sim_loss": 0.1015625 }, { "epoch": 0.3880759343484279, "step": 3925, "train/total_loss": 0.1754010021686554 }, { "entropy": 8.920717239379883, "epoch": 0.38817480719794345, "mean_token_accuracy": 0.7547169923782349, "num_tokens": 21580194.0, "step": 3926, "train/ce_loss": 0.6908882856369019 }, { "epoch": 0.38817480719794345, "step": 3926, "train/sim_loss": 0.0703125 }, { "epoch": 0.38817480719794345, "step": 3926, "train/total_loss": 0.13940133154392242 }, { "entropy": 8.830814361572266, "epoch": 0.388273680047459, "mean_token_accuracy": 0.7377777695655823, "num_tokens": 21585743.0, "step": 3927, "train/ce_loss": 1.340330719947815 }, { "epoch": 0.388273680047459, "step": 3927, "train/sim_loss": 0.078125 }, { "epoch": 0.388273680047459, "step": 3927, "train/total_loss": 0.21215806901454926 }, { "entropy": 9.175716400146484, "epoch": 0.3883725528969745, "mean_token_accuracy": 0.7855361700057983, "num_tokens": 21591107.0, "step": 3928, "train/ce_loss": 0.8339326977729797 }, { "epoch": 0.3883725528969745, "step": 3928, "train/sim_loss": 0.046875 }, { "epoch": 0.3883725528969745, "step": 3928, "train/total_loss": 0.13026827573776245 }, { "entropy": 8.850831985473633, "epoch": 0.38847142574649, "mean_token_accuracy": 0.7282728552818298, "num_tokens": 21596723.0, "step": 3929, "train/ce_loss": 1.3745161294937134 }, { "epoch": 0.38847142574649, "step": 3929, "train/sim_loss": 0.0859375 }, { "epoch": 0.38847142574649, "step": 3929, "train/total_loss": 0.22338911890983582 }, { "entropy": 8.918290138244629, "epoch": 0.38857029859600556, "mean_token_accuracy": 0.7361268401145935, "num_tokens": 21602246.0, "step": 3930, "train/ce_loss": 0.8492109775543213 }, { "epoch": 0.38857029859600556, "step": 3930, "train/sim_loss": 0.08203125 }, { "epoch": 0.38857029859600556, "step": 3930, "train/total_loss": 0.16695234179496765 }, { "entropy": 9.257669448852539, "epoch": 0.38866917144552104, "mean_token_accuracy": 0.782608687877655, "num_tokens": 21607642.0, "step": 3931, "train/ce_loss": 0.7860640287399292 }, { "epoch": 0.38866917144552104, "step": 3931, "train/sim_loss": 0.03125 }, { "epoch": 0.38866917144552104, "step": 3931, "train/total_loss": 0.10985640436410904 }, { "entropy": 8.791999816894531, "epoch": 0.3887680442950366, "mean_token_accuracy": 0.6898954510688782, "num_tokens": 21613299.0, "step": 3932, "train/ce_loss": 0.6042277812957764 }, { "epoch": 0.3887680442950366, "step": 3932, "train/sim_loss": 0.015625 }, { "epoch": 0.3887680442950366, "step": 3932, "train/total_loss": 0.07604777812957764 }, { "entropy": 8.785094261169434, "epoch": 0.3888669171445521, "mean_token_accuracy": 0.7416879534721375, "num_tokens": 21618739.0, "step": 3933, "train/ce_loss": 0.7748826742172241 }, { "epoch": 0.3888669171445521, "step": 3933, "train/sim_loss": 0.05859375 }, { "epoch": 0.3888669171445521, "step": 3933, "train/total_loss": 0.1360820233821869 }, { "entropy": 8.93966007232666, "epoch": 0.3889657899940676, "mean_token_accuracy": 0.7590618133544922, "num_tokens": 21624327.0, "step": 3934, "train/ce_loss": 0.5545072555541992 }, { "epoch": 0.3889657899940676, "step": 3934, "train/sim_loss": 0.015625 }, { "epoch": 0.3889657899940676, "step": 3934, "train/total_loss": 0.07107572257518768 }, { "entropy": 9.152774810791016, "epoch": 0.38906466284358315, "mean_token_accuracy": 0.7747092843055725, "num_tokens": 21629682.0, "step": 3935, "train/ce_loss": 1.1799304485321045 }, { "epoch": 0.38906466284358315, "step": 3935, "train/sim_loss": 0.09375 }, { "epoch": 0.38906466284358315, "step": 3935, "train/total_loss": 0.2117430567741394 }, { "entropy": 9.259079933166504, "epoch": 0.3891635356930987, "mean_token_accuracy": 0.7007874250411987, "num_tokens": 21635036.0, "step": 3936, "train/ce_loss": 1.0783183574676514 }, { "epoch": 0.3891635356930987, "step": 3936, "train/sim_loss": 0.109375 }, { "epoch": 0.3891635356930987, "step": 3936, "train/total_loss": 0.21720683574676514 }, { "entropy": 9.288758277893066, "epoch": 0.3892624085426142, "mean_token_accuracy": 0.77173912525177, "num_tokens": 21640410.0, "step": 3937, "train/ce_loss": 0.6935651898384094 }, { "epoch": 0.3892624085426142, "step": 3937, "train/sim_loss": 0.03125 }, { "epoch": 0.3892624085426142, "step": 3937, "train/total_loss": 0.1006065234541893 }, { "entropy": 9.033956527709961, "epoch": 0.3893612813921297, "mean_token_accuracy": 0.7870680093765259, "num_tokens": 21646096.0, "step": 3938, "train/ce_loss": 0.6374540328979492 }, { "epoch": 0.3893612813921297, "step": 3938, "train/sim_loss": 0.01953125 }, { "epoch": 0.3893612813921297, "step": 3938, "train/total_loss": 0.0832766517996788 }, { "entropy": 9.013723373413086, "epoch": 0.38946015424164526, "mean_token_accuracy": 0.7342073917388916, "num_tokens": 21651578.0, "step": 3939, "train/ce_loss": 1.0115725994110107 }, { "epoch": 0.38946015424164526, "step": 3939, "train/sim_loss": 0.09375 }, { "epoch": 0.38946015424164526, "step": 3939, "train/total_loss": 0.1949072629213333 }, { "epoch": 0.38955902709116075, "grad_norm": 0.792891263961792, "learning_rate": 9.028581318300944e-06, "loss": 0.1434, "step": 3940 }, { "entropy": 9.348848342895508, "epoch": 0.38955902709116075, "mean_token_accuracy": 0.7213352918624878, "num_tokens": 21656910.0, "step": 3940, "train/ce_loss": 1.101068377494812 }, { "epoch": 0.38955902709116075, "step": 3940, "train/sim_loss": 0.05859375 }, { "epoch": 0.38955902709116075, "step": 3940, "train/total_loss": 0.16870059072971344 }, { "entropy": 8.767045974731445, "epoch": 0.3896578999406763, "mean_token_accuracy": 0.7465346455574036, "num_tokens": 21662505.0, "step": 3941, "train/ce_loss": 1.0767587423324585 }, { "epoch": 0.3896578999406763, "step": 3941, "train/sim_loss": 0.1015625 }, { "epoch": 0.3896578999406763, "step": 3941, "train/total_loss": 0.20923838019371033 }, { "entropy": 9.036320686340332, "epoch": 0.38975677279019183, "mean_token_accuracy": 0.7564102411270142, "num_tokens": 21668001.0, "step": 3942, "train/ce_loss": 0.43060529232025146 }, { "epoch": 0.38975677279019183, "step": 3942, "train/sim_loss": 0.0234375 }, { "epoch": 0.38975677279019183, "step": 3942, "train/total_loss": 0.06649802625179291 }, { "entropy": 8.654132843017578, "epoch": 0.3898556456397073, "mean_token_accuracy": 0.7067099809646606, "num_tokens": 21673619.0, "step": 3943, "train/ce_loss": 0.78564453125 }, { "epoch": 0.3898556456397073, "step": 3943, "train/sim_loss": 0.0625 }, { "epoch": 0.3898556456397073, "step": 3943, "train/total_loss": 0.14106446504592896 }, { "entropy": 9.038406372070312, "epoch": 0.38995451848922286, "mean_token_accuracy": 0.7606318593025208, "num_tokens": 21679257.0, "step": 3944, "train/ce_loss": 0.7786918878555298 }, { "epoch": 0.38995451848922286, "step": 3944, "train/sim_loss": 0.0625 }, { "epoch": 0.38995451848922286, "step": 3944, "train/total_loss": 0.14036919176578522 }, { "entropy": 8.933156967163086, "epoch": 0.3900533913387384, "mean_token_accuracy": 0.7421171069145203, "num_tokens": 21684682.0, "step": 3945, "train/ce_loss": 0.8051201105117798 }, { "epoch": 0.3900533913387384, "step": 3945, "train/sim_loss": 0.0546875 }, { "epoch": 0.3900533913387384, "step": 3945, "train/total_loss": 0.13519951701164246 }, { "entropy": 9.132551193237305, "epoch": 0.3901522641882539, "mean_token_accuracy": 0.7176981568336487, "num_tokens": 21690207.0, "step": 3946, "train/ce_loss": 0.8522765040397644 }, { "epoch": 0.3901522641882539, "step": 3946, "train/sim_loss": 0.046875 }, { "epoch": 0.3901522641882539, "step": 3946, "train/total_loss": 0.13210265338420868 }, { "entropy": 9.108736038208008, "epoch": 0.3902511370377694, "mean_token_accuracy": 0.733418345451355, "num_tokens": 21695626.0, "step": 3947, "train/ce_loss": 1.1510869264602661 }, { "epoch": 0.3902511370377694, "step": 3947, "train/sim_loss": 0.1015625 }, { "epoch": 0.3902511370377694, "step": 3947, "train/total_loss": 0.2166711986064911 }, { "entropy": 9.249579429626465, "epoch": 0.39035000988728497, "mean_token_accuracy": 0.7466307282447815, "num_tokens": 21700922.0, "step": 3948, "train/ce_loss": 0.7796293497085571 }, { "epoch": 0.39035000988728497, "step": 3948, "train/sim_loss": 0.046875 }, { "epoch": 0.39035000988728497, "step": 3948, "train/total_loss": 0.12483793497085571 }, { "entropy": 9.12987232208252, "epoch": 0.39044888273680045, "mean_token_accuracy": 0.7167235612869263, "num_tokens": 21706412.0, "step": 3949, "train/ce_loss": 0.9599942564964294 }, { "epoch": 0.39044888273680045, "step": 3949, "train/sim_loss": 0.1015625 }, { "epoch": 0.39044888273680045, "step": 3949, "train/total_loss": 0.19756191968917847 }, { "entropy": 8.806581497192383, "epoch": 0.390547755586316, "mean_token_accuracy": 0.7763158082962036, "num_tokens": 21712011.0, "step": 3950, "train/ce_loss": 0.5217951536178589 }, { "epoch": 0.390547755586316, "step": 3950, "train/sim_loss": 0.046875 }, { "epoch": 0.390547755586316, "step": 3950, "train/total_loss": 0.09905451536178589 }, { "entropy": 8.90499496459961, "epoch": 0.39064662843583153, "mean_token_accuracy": 0.7266880869865417, "num_tokens": 21717382.0, "step": 3951, "train/ce_loss": 1.08430814743042 }, { "epoch": 0.39064662843583153, "step": 3951, "train/sim_loss": 0.046875 }, { "epoch": 0.39064662843583153, "step": 3951, "train/total_loss": 0.15530581772327423 }, { "entropy": 8.760534286499023, "epoch": 0.390745501285347, "mean_token_accuracy": 0.7421441674232483, "num_tokens": 21723067.0, "step": 3952, "train/ce_loss": 0.4404027760028839 }, { "epoch": 0.390745501285347, "step": 3952, "train/sim_loss": 0.07421875 }, { "epoch": 0.390745501285347, "step": 3952, "train/total_loss": 0.11825902760028839 }, { "entropy": 9.088045120239258, "epoch": 0.39084437413486256, "mean_token_accuracy": 0.6983094811439514, "num_tokens": 21728508.0, "step": 3953, "train/ce_loss": 0.6687819957733154 }, { "epoch": 0.39084437413486256, "step": 3953, "train/sim_loss": 0.08203125 }, { "epoch": 0.39084437413486256, "step": 3953, "train/total_loss": 0.14890944957733154 }, { "entropy": 8.972650527954102, "epoch": 0.3909432469843781, "mean_token_accuracy": 0.7472647428512573, "num_tokens": 21734068.0, "step": 3954, "train/ce_loss": 0.5897306203842163 }, { "epoch": 0.3909432469843781, "step": 3954, "train/sim_loss": 0.01953125 }, { "epoch": 0.3909432469843781, "step": 3954, "train/total_loss": 0.07850430905818939 }, { "entropy": 8.908829689025879, "epoch": 0.3910421198338936, "mean_token_accuracy": 0.7413997650146484, "num_tokens": 21739542.0, "step": 3955, "train/ce_loss": 0.7087785601615906 }, { "epoch": 0.3910421198338936, "step": 3955, "train/sim_loss": 0.046875 }, { "epoch": 0.3910421198338936, "step": 3955, "train/total_loss": 0.11775285750627518 }, { "entropy": 8.954082489013672, "epoch": 0.3911409926834091, "mean_token_accuracy": 0.7660044431686401, "num_tokens": 21745031.0, "step": 3956, "train/ce_loss": 0.6265463829040527 }, { "epoch": 0.3911409926834091, "step": 3956, "train/sim_loss": 0.03515625 }, { "epoch": 0.3911409926834091, "step": 3956, "train/total_loss": 0.09781088680028915 }, { "entropy": 9.064226150512695, "epoch": 0.39123986553292467, "mean_token_accuracy": 0.7643051743507385, "num_tokens": 21750388.0, "step": 3957, "train/ce_loss": 0.5722399353981018 }, { "epoch": 0.39123986553292467, "step": 3957, "train/sim_loss": 0.01953125 }, { "epoch": 0.39123986553292467, "step": 3957, "train/total_loss": 0.07675524055957794 }, { "entropy": 8.597301483154297, "epoch": 0.39133873838244015, "mean_token_accuracy": 0.7235067486763, "num_tokens": 21755975.0, "step": 3958, "train/ce_loss": 0.46440455317497253 }, { "epoch": 0.39133873838244015, "step": 3958, "train/sim_loss": 0.046875 }, { "epoch": 0.39133873838244015, "step": 3958, "train/total_loss": 0.09331545233726501 }, { "entropy": 8.709820747375488, "epoch": 0.3914376112319557, "mean_token_accuracy": 0.7178968787193298, "num_tokens": 21761568.0, "step": 3959, "train/ce_loss": 1.0960783958435059 }, { "epoch": 0.3914376112319557, "step": 3959, "train/sim_loss": 0.078125 }, { "epoch": 0.3914376112319557, "step": 3959, "train/total_loss": 0.18773284554481506 }, { "epoch": 0.39153648408147124, "grad_norm": 0.746030330657959, "learning_rate": 9.023636453542997e-06, "loss": 0.1424, "step": 3960 }, { "entropy": 8.839244842529297, "epoch": 0.39153648408147124, "mean_token_accuracy": 0.7789473533630371, "num_tokens": 21767034.0, "step": 3960, "train/ce_loss": 0.5408331751823425 }, { "epoch": 0.39153648408147124, "step": 3960, "train/sim_loss": 0.05078125 }, { "epoch": 0.39153648408147124, "step": 3960, "train/total_loss": 0.10486456751823425 }, { "entropy": 8.845893859863281, "epoch": 0.3916353569309868, "mean_token_accuracy": 0.7833753228187561, "num_tokens": 21772488.0, "step": 3961, "train/ce_loss": 0.6815752983093262 }, { "epoch": 0.3916353569309868, "step": 3961, "train/sim_loss": 0.01171875 }, { "epoch": 0.3916353569309868, "step": 3961, "train/total_loss": 0.07987628132104874 }, { "entropy": 8.64979362487793, "epoch": 0.39173422978050226, "mean_token_accuracy": 0.7377521395683289, "num_tokens": 21778145.0, "step": 3962, "train/ce_loss": 1.005744457244873 }, { "epoch": 0.39173422978050226, "step": 3962, "train/sim_loss": 0.06640625 }, { "epoch": 0.39173422978050226, "step": 3962, "train/total_loss": 0.16698069870471954 }, { "entropy": 9.14356803894043, "epoch": 0.3918331026300178, "mean_token_accuracy": 0.7818877696990967, "num_tokens": 21783507.0, "step": 3963, "train/ce_loss": 0.7642431259155273 }, { "epoch": 0.3918331026300178, "step": 3963, "train/sim_loss": 0.0546875 }, { "epoch": 0.3918331026300178, "step": 3963, "train/total_loss": 0.13111181557178497 }, { "entropy": 9.086061477661133, "epoch": 0.39193197547953335, "mean_token_accuracy": 0.7687651515007019, "num_tokens": 21788960.0, "step": 3964, "train/ce_loss": 0.5597928166389465 }, { "epoch": 0.39193197547953335, "step": 3964, "train/sim_loss": 0.0234375 }, { "epoch": 0.39193197547953335, "step": 3964, "train/total_loss": 0.07941678166389465 }, { "entropy": 9.319067001342773, "epoch": 0.39203084832904883, "mean_token_accuracy": 0.7583465576171875, "num_tokens": 21794233.0, "step": 3965, "train/ce_loss": 0.6416122913360596 }, { "epoch": 0.39203084832904883, "step": 3965, "train/sim_loss": 0.0625 }, { "epoch": 0.39203084832904883, "step": 3965, "train/total_loss": 0.1266612410545349 }, { "entropy": 9.115740776062012, "epoch": 0.39212972117856437, "mean_token_accuracy": 0.8056265711784363, "num_tokens": 21799617.0, "step": 3966, "train/ce_loss": 0.8833128213882446 }, { "epoch": 0.39212972117856437, "step": 3966, "train/sim_loss": 0.015625 }, { "epoch": 0.39212972117856437, "step": 3966, "train/total_loss": 0.10395628213882446 }, { "entropy": 9.045942306518555, "epoch": 0.3922285940280799, "mean_token_accuracy": 0.7458704113960266, "num_tokens": 21805037.0, "step": 3967, "train/ce_loss": 0.4274313747882843 }, { "epoch": 0.3922285940280799, "step": 3967, "train/sim_loss": 0.0234375 }, { "epoch": 0.3922285940280799, "step": 3967, "train/total_loss": 0.06618063896894455 }, { "entropy": 8.557735443115234, "epoch": 0.3923274668775954, "mean_token_accuracy": 0.7411290407180786, "num_tokens": 21810857.0, "step": 3968, "train/ce_loss": 0.3996378779411316 }, { "epoch": 0.3923274668775954, "step": 3968, "train/sim_loss": 0.046875 }, { "epoch": 0.3923274668775954, "step": 3968, "train/total_loss": 0.08683878928422928 }, { "entropy": 8.849931716918945, "epoch": 0.39242633972711094, "mean_token_accuracy": 0.7730061411857605, "num_tokens": 21816254.0, "step": 3969, "train/ce_loss": 0.5527231693267822 }, { "epoch": 0.39242633972711094, "step": 3969, "train/sim_loss": 0.0390625 }, { "epoch": 0.39242633972711094, "step": 3969, "train/total_loss": 0.09433481842279434 }, { "entropy": 9.045767784118652, "epoch": 0.3925252125766265, "mean_token_accuracy": 0.7754077911376953, "num_tokens": 21821685.0, "step": 3970, "train/ce_loss": 0.7027701139450073 }, { "epoch": 0.3925252125766265, "step": 3970, "train/sim_loss": 0.0546875 }, { "epoch": 0.3925252125766265, "step": 3970, "train/total_loss": 0.12496451288461685 }, { "entropy": 8.806802749633789, "epoch": 0.39262408542614197, "mean_token_accuracy": 0.6961451172828674, "num_tokens": 21827179.0, "step": 3971, "train/ce_loss": 0.7233871221542358 }, { "epoch": 0.39262408542614197, "step": 3971, "train/sim_loss": 0.0625 }, { "epoch": 0.39262408542614197, "step": 3971, "train/total_loss": 0.13483871519565582 }, { "entropy": 9.080485343933105, "epoch": 0.3927229582756575, "mean_token_accuracy": 0.7732240557670593, "num_tokens": 21832517.0, "step": 3972, "train/ce_loss": 0.547297477722168 }, { "epoch": 0.3927229582756575, "step": 3972, "train/sim_loss": 0.03125 }, { "epoch": 0.3927229582756575, "step": 3972, "train/total_loss": 0.08597974479198456 }, { "entropy": 9.245564460754395, "epoch": 0.39282183112517305, "mean_token_accuracy": 0.7430939078330994, "num_tokens": 21837784.0, "step": 3973, "train/ce_loss": 1.0268205404281616 }, { "epoch": 0.39282183112517305, "step": 3973, "train/sim_loss": 0.0703125 }, { "epoch": 0.39282183112517305, "step": 3973, "train/total_loss": 0.17299455404281616 }, { "entropy": 8.621062278747559, "epoch": 0.39292070397468853, "mean_token_accuracy": 0.6832946538925171, "num_tokens": 21843281.0, "step": 3974, "train/ce_loss": 1.241938829421997 }, { "epoch": 0.39292070397468853, "step": 3974, "train/sim_loss": 0.0546875 }, { "epoch": 0.39292070397468853, "step": 3974, "train/total_loss": 0.17888137698173523 }, { "entropy": 9.245319366455078, "epoch": 0.3930195768242041, "mean_token_accuracy": 0.7021530866622925, "num_tokens": 21848750.0, "step": 3975, "train/ce_loss": 1.3521819114685059 }, { "epoch": 0.3930195768242041, "step": 3975, "train/sim_loss": 0.08203125 }, { "epoch": 0.3930195768242041, "step": 3975, "train/total_loss": 0.21724943816661835 }, { "entropy": 9.032548904418945, "epoch": 0.3931184496737196, "mean_token_accuracy": 0.7074074149131775, "num_tokens": 21854054.0, "step": 3976, "train/ce_loss": 0.8338217735290527 }, { "epoch": 0.3931184496737196, "step": 3976, "train/sim_loss": 0.0546875 }, { "epoch": 0.3931184496737196, "step": 3976, "train/total_loss": 0.13806968927383423 }, { "entropy": 9.324067115783691, "epoch": 0.3932173225232351, "mean_token_accuracy": 0.7620286345481873, "num_tokens": 21859362.0, "step": 3977, "train/ce_loss": 0.9925889372825623 }, { "epoch": 0.3932173225232351, "step": 3977, "train/sim_loss": 0.0625 }, { "epoch": 0.3932173225232351, "step": 3977, "train/total_loss": 0.1617588996887207 }, { "entropy": 8.952319145202637, "epoch": 0.39331619537275064, "mean_token_accuracy": 0.7630661725997925, "num_tokens": 21864831.0, "step": 3978, "train/ce_loss": 0.8762582540512085 }, { "epoch": 0.39331619537275064, "step": 3978, "train/sim_loss": 0.046875 }, { "epoch": 0.39331619537275064, "step": 3978, "train/total_loss": 0.13450083136558533 }, { "entropy": 9.28736400604248, "epoch": 0.3934150682222662, "mean_token_accuracy": 0.7659863829612732, "num_tokens": 21870400.0, "step": 3979, "train/ce_loss": 0.7665682435035706 }, { "epoch": 0.3934150682222662, "step": 3979, "train/sim_loss": 0.06640625 }, { "epoch": 0.3934150682222662, "step": 3979, "train/total_loss": 0.14306306838989258 }, { "epoch": 0.39351394107178167, "grad_norm": 0.746255099773407, "learning_rate": 9.018691588785047e-06, "loss": 0.1345, "step": 3980 }, { "entropy": 8.942122459411621, "epoch": 0.39351394107178167, "mean_token_accuracy": 0.7699999809265137, "num_tokens": 21875973.0, "step": 3980, "train/ce_loss": 0.7866836190223694 }, { "epoch": 0.39351394107178167, "step": 3980, "train/sim_loss": 0.09765625 }, { "epoch": 0.39351394107178167, "step": 3980, "train/total_loss": 0.17632460594177246 }, { "entropy": 9.421318054199219, "epoch": 0.3936128139212972, "mean_token_accuracy": 0.6981919407844543, "num_tokens": 21881329.0, "step": 3981, "train/ce_loss": 0.5948910117149353 }, { "epoch": 0.3936128139212972, "step": 3981, "train/sim_loss": 0.02734375 }, { "epoch": 0.3936128139212972, "step": 3981, "train/total_loss": 0.08683285117149353 }, { "entropy": 8.873405456542969, "epoch": 0.39371168677081275, "mean_token_accuracy": 0.7712137699127197, "num_tokens": 21886928.0, "step": 3982, "train/ce_loss": 0.4375554621219635 }, { "epoch": 0.39371168677081275, "step": 3982, "train/sim_loss": 0.05078125 }, { "epoch": 0.39371168677081275, "step": 3982, "train/total_loss": 0.09453679621219635 }, { "entropy": 9.150245666503906, "epoch": 0.39381055962032824, "mean_token_accuracy": 0.7502850890159607, "num_tokens": 21892452.0, "step": 3983, "train/ce_loss": 0.4543621838092804 }, { "epoch": 0.39381055962032824, "step": 3983, "train/sim_loss": 0.06640625 }, { "epoch": 0.39381055962032824, "step": 3983, "train/total_loss": 0.11184246838092804 }, { "entropy": 8.91325855255127, "epoch": 0.3939094324698438, "mean_token_accuracy": 0.7428571581840515, "num_tokens": 21898040.0, "step": 3984, "train/ce_loss": 1.1813642978668213 }, { "epoch": 0.3939094324698438, "step": 3984, "train/sim_loss": 0.078125 }, { "epoch": 0.3939094324698438, "step": 3984, "train/total_loss": 0.1962614357471466 }, { "entropy": 8.72839069366455, "epoch": 0.3940083053193593, "mean_token_accuracy": 0.7388535141944885, "num_tokens": 21903606.0, "step": 3985, "train/ce_loss": 0.6135731339454651 }, { "epoch": 0.3940083053193593, "step": 3985, "train/sim_loss": 0.07421875 }, { "epoch": 0.3940083053193593, "step": 3985, "train/total_loss": 0.135576069355011 }, { "entropy": 9.19976806640625, "epoch": 0.3941071781688748, "mean_token_accuracy": 0.7281292080879211, "num_tokens": 21908924.0, "step": 3986, "train/ce_loss": 0.6096265912055969 }, { "epoch": 0.3941071781688748, "step": 3986, "train/sim_loss": 0.04296875 }, { "epoch": 0.3941071781688748, "step": 3986, "train/total_loss": 0.10393141210079193 }, { "entropy": 8.536641120910645, "epoch": 0.39420605101839035, "mean_token_accuracy": 0.7614313960075378, "num_tokens": 21914566.0, "step": 3987, "train/ce_loss": 0.7680538892745972 }, { "epoch": 0.39420605101839035, "step": 3987, "train/sim_loss": 0.0625 }, { "epoch": 0.39420605101839035, "step": 3987, "train/total_loss": 0.13930538296699524 }, { "entropy": 9.183318138122559, "epoch": 0.3943049238679059, "mean_token_accuracy": 0.6678614020347595, "num_tokens": 21920009.0, "step": 3988, "train/ce_loss": 0.9419891238212585 }, { "epoch": 0.3943049238679059, "step": 3988, "train/sim_loss": 0.05078125 }, { "epoch": 0.3943049238679059, "step": 3988, "train/total_loss": 0.14498016238212585 }, { "entropy": 8.600053787231445, "epoch": 0.3944037967174214, "mean_token_accuracy": 0.7401869297027588, "num_tokens": 21925682.0, "step": 3989, "train/ce_loss": 0.6278889775276184 }, { "epoch": 0.3944037967174214, "step": 3989, "train/sim_loss": 0.08203125 }, { "epoch": 0.3944037967174214, "step": 3989, "train/total_loss": 0.14482015371322632 }, { "entropy": 9.106494903564453, "epoch": 0.3945026695669369, "mean_token_accuracy": 0.7867646813392639, "num_tokens": 21931081.0, "step": 3990, "train/ce_loss": 0.6524695754051208 }, { "epoch": 0.3945026695669369, "step": 3990, "train/sim_loss": 0.078125 }, { "epoch": 0.3945026695669369, "step": 3990, "train/total_loss": 0.14337196946144104 }, { "entropy": 9.087204933166504, "epoch": 0.39460154241645246, "mean_token_accuracy": 0.7589040994644165, "num_tokens": 21936427.0, "step": 3991, "train/ce_loss": 0.9414315223693848 }, { "epoch": 0.39460154241645246, "step": 3991, "train/sim_loss": 0.04296875 }, { "epoch": 0.39460154241645246, "step": 3991, "train/total_loss": 0.13711190223693848 }, { "entropy": 8.774184226989746, "epoch": 0.39470041526596794, "mean_token_accuracy": 0.7754892110824585, "num_tokens": 21942090.0, "step": 3992, "train/ce_loss": 0.8487780690193176 }, { "epoch": 0.39470041526596794, "step": 3992, "train/sim_loss": 0.1015625 }, { "epoch": 0.39470041526596794, "step": 3992, "train/total_loss": 0.18644031882286072 }, { "entropy": 8.927021026611328, "epoch": 0.3947992881154835, "mean_token_accuracy": 0.7111681699752808, "num_tokens": 21947465.0, "step": 3993, "train/ce_loss": 1.1804927587509155 }, { "epoch": 0.3947992881154835, "step": 3993, "train/sim_loss": 0.10546875 }, { "epoch": 0.3947992881154835, "step": 3993, "train/total_loss": 0.2235180288553238 }, { "entropy": 8.835959434509277, "epoch": 0.394898160964999, "mean_token_accuracy": 0.6662665009498596, "num_tokens": 21952906.0, "step": 3994, "train/ce_loss": 1.24826180934906 }, { "epoch": 0.394898160964999, "step": 3994, "train/sim_loss": 0.0390625 }, { "epoch": 0.394898160964999, "step": 3994, "train/total_loss": 0.16388869285583496 }, { "entropy": 9.006898880004883, "epoch": 0.3949970338145145, "mean_token_accuracy": 0.7237308025360107, "num_tokens": 21958253.0, "step": 3995, "train/ce_loss": 0.795710027217865 }, { "epoch": 0.3949970338145145, "step": 3995, "train/sim_loss": 0.0390625 }, { "epoch": 0.3949970338145145, "step": 3995, "train/total_loss": 0.11863350123167038 }, { "entropy": 9.031170845031738, "epoch": 0.39509590666403005, "mean_token_accuracy": 0.7637698650360107, "num_tokens": 21963677.0, "step": 3996, "train/ce_loss": 0.9216145873069763 }, { "epoch": 0.39509590666403005, "step": 3996, "train/sim_loss": 0.09375 }, { "epoch": 0.39509590666403005, "step": 3996, "train/total_loss": 0.18591146171092987 }, { "entropy": 9.218181610107422, "epoch": 0.3951947795135456, "mean_token_accuracy": 0.7334200143814087, "num_tokens": 21968972.0, "step": 3997, "train/ce_loss": 0.7081965804100037 }, { "epoch": 0.3951947795135456, "step": 3997, "train/sim_loss": 0.06640625 }, { "epoch": 0.3951947795135456, "step": 3997, "train/total_loss": 0.1372259110212326 }, { "entropy": 8.71298885345459, "epoch": 0.3952936523630611, "mean_token_accuracy": 0.759381890296936, "num_tokens": 21974529.0, "step": 3998, "train/ce_loss": 0.32262593507766724 }, { "epoch": 0.3952936523630611, "step": 3998, "train/sim_loss": 0.0546875 }, { "epoch": 0.3952936523630611, "step": 3998, "train/total_loss": 0.08695009350776672 }, { "entropy": 8.963014602661133, "epoch": 0.3953925252125766, "mean_token_accuracy": 0.777063250541687, "num_tokens": 21980060.0, "step": 3999, "train/ce_loss": 0.7161266207695007 }, { "epoch": 0.3953925252125766, "step": 3999, "train/sim_loss": 0.0625 }, { "epoch": 0.3953925252125766, "step": 3999, "train/total_loss": 0.1341126561164856 }, { "epoch": 0.39549139806209216, "grad_norm": 0.6682150363922119, "learning_rate": 9.013746724027098e-06, "loss": 0.1495, "step": 4000 }, { "entropy": 8.96671199798584, "epoch": 0.39549139806209216, "mean_token_accuracy": 0.75944584608078, "num_tokens": 21985435.0, "step": 4000, "train/ce_loss": 0.873297929763794 }, { "epoch": 0.39549139806209216, "step": 4000, "train/sim_loss": 0.0703125 }, { "epoch": 0.39549139806209216, "step": 4000, "train/total_loss": 0.15764230489730835 }, { "entropy": 8.740777015686035, "epoch": 0.3955902709116077, "mean_token_accuracy": 0.7608926892280579, "num_tokens": 21991067.0, "step": 4001, "train/ce_loss": 0.9379466772079468 }, { "epoch": 0.3955902709116077, "step": 4001, "train/sim_loss": 0.08984375 }, { "epoch": 0.3955902709116077, "step": 4001, "train/total_loss": 0.18363842368125916 }, { "entropy": 9.076741218566895, "epoch": 0.3956891437611232, "mean_token_accuracy": 0.7624161243438721, "num_tokens": 21996348.0, "step": 4002, "train/ce_loss": 0.7792680263519287 }, { "epoch": 0.3956891437611232, "step": 4002, "train/sim_loss": 0.0546875 }, { "epoch": 0.3956891437611232, "step": 4002, "train/total_loss": 0.13261431455612183 }, { "entropy": 8.859529495239258, "epoch": 0.3957880166106387, "mean_token_accuracy": 0.7347174286842346, "num_tokens": 22001820.0, "step": 4003, "train/ce_loss": 0.4477638602256775 }, { "epoch": 0.3957880166106387, "step": 4003, "train/sim_loss": 0.0546875 }, { "epoch": 0.3957880166106387, "step": 4003, "train/total_loss": 0.09946388751268387 }, { "entropy": 9.38194465637207, "epoch": 0.39588688946015427, "mean_token_accuracy": 0.7427745461463928, "num_tokens": 22007129.0, "step": 4004, "train/ce_loss": 0.8379200100898743 }, { "epoch": 0.39588688946015427, "step": 4004, "train/sim_loss": 0.05078125 }, { "epoch": 0.39588688946015427, "step": 4004, "train/total_loss": 0.13457325100898743 }, { "entropy": 8.896053314208984, "epoch": 0.39598576230966975, "mean_token_accuracy": 0.7294900417327881, "num_tokens": 22012623.0, "step": 4005, "train/ce_loss": 0.5559148788452148 }, { "epoch": 0.39598576230966975, "step": 4005, "train/sim_loss": 0.078125 }, { "epoch": 0.39598576230966975, "step": 4005, "train/total_loss": 0.13371649384498596 }, { "entropy": 8.8294038772583, "epoch": 0.3960846351591853, "mean_token_accuracy": 0.7536842226982117, "num_tokens": 22018283.0, "step": 4006, "train/ce_loss": 0.7396324872970581 }, { "epoch": 0.3960846351591853, "step": 4006, "train/sim_loss": 0.08984375 }, { "epoch": 0.3960846351591853, "step": 4006, "train/total_loss": 0.1638070046901703 }, { "entropy": 9.100353240966797, "epoch": 0.39618350800870084, "mean_token_accuracy": 0.7503234148025513, "num_tokens": 22023621.0, "step": 4007, "train/ce_loss": 0.7225068807601929 }, { "epoch": 0.39618350800870084, "step": 4007, "train/sim_loss": 0.05078125 }, { "epoch": 0.39618350800870084, "step": 4007, "train/total_loss": 0.12303193658590317 }, { "entropy": 8.906133651733398, "epoch": 0.3962823808582163, "mean_token_accuracy": 0.7939646244049072, "num_tokens": 22029212.0, "step": 4008, "train/ce_loss": 0.8344955444335938 }, { "epoch": 0.3962823808582163, "step": 4008, "train/sim_loss": 0.08203125 }, { "epoch": 0.3962823808582163, "step": 4008, "train/total_loss": 0.1654808074235916 }, { "entropy": 8.77525520324707, "epoch": 0.39638125370773186, "mean_token_accuracy": 0.7336734533309937, "num_tokens": 22034820.0, "step": 4009, "train/ce_loss": 0.7943027019500732 }, { "epoch": 0.39638125370773186, "step": 4009, "train/sim_loss": 0.07421875 }, { "epoch": 0.39638125370773186, "step": 4009, "train/total_loss": 0.15364903211593628 }, { "entropy": 8.960542678833008, "epoch": 0.3964801265572474, "mean_token_accuracy": 0.6916342377662659, "num_tokens": 22040389.0, "step": 4010, "train/ce_loss": 1.0151580572128296 }, { "epoch": 0.3964801265572474, "step": 4010, "train/sim_loss": 0.13671875 }, { "epoch": 0.3964801265572474, "step": 4010, "train/total_loss": 0.23823454976081848 }, { "entropy": 9.015408515930176, "epoch": 0.3965789994067629, "mean_token_accuracy": 0.7610418796539307, "num_tokens": 22045891.0, "step": 4011, "train/ce_loss": 0.5547149777412415 }, { "epoch": 0.3965789994067629, "step": 4011, "train/sim_loss": 0.0390625 }, { "epoch": 0.3965789994067629, "step": 4011, "train/total_loss": 0.0945339947938919 }, { "entropy": 9.003637313842773, "epoch": 0.39667787225627843, "mean_token_accuracy": 0.7476415038108826, "num_tokens": 22051358.0, "step": 4012, "train/ce_loss": 0.8419812321662903 }, { "epoch": 0.39667787225627843, "step": 4012, "train/sim_loss": 0.0546875 }, { "epoch": 0.39667787225627843, "step": 4012, "train/total_loss": 0.13888561725616455 }, { "entropy": 9.087237358093262, "epoch": 0.39677674510579397, "mean_token_accuracy": 0.7443609237670898, "num_tokens": 22056733.0, "step": 4013, "train/ce_loss": 0.7785549759864807 }, { "epoch": 0.39677674510579397, "step": 4013, "train/sim_loss": 0.046875 }, { "epoch": 0.39677674510579397, "step": 4013, "train/total_loss": 0.12473049759864807 }, { "entropy": 8.587739944458008, "epoch": 0.39687561795530946, "mean_token_accuracy": 0.7357894778251648, "num_tokens": 22062305.0, "step": 4014, "train/ce_loss": 0.9409742951393127 }, { "epoch": 0.39687561795530946, "step": 4014, "train/sim_loss": 0.04296875 }, { "epoch": 0.39687561795530946, "step": 4014, "train/total_loss": 0.13706618547439575 }, { "entropy": 8.980234146118164, "epoch": 0.396974490804825, "mean_token_accuracy": 0.7376654744148254, "num_tokens": 22067759.0, "step": 4015, "train/ce_loss": 0.7446652054786682 }, { "epoch": 0.396974490804825, "step": 4015, "train/sim_loss": 0.0546875 }, { "epoch": 0.396974490804825, "step": 4015, "train/total_loss": 0.1291540265083313 }, { "entropy": 9.294900894165039, "epoch": 0.39707336365434054, "mean_token_accuracy": 0.6786155700683594, "num_tokens": 22073145.0, "step": 4016, "train/ce_loss": 0.5436044335365295 }, { "epoch": 0.39707336365434054, "step": 4016, "train/sim_loss": 0.03515625 }, { "epoch": 0.39707336365434054, "step": 4016, "train/total_loss": 0.08951669931411743 }, { "entropy": 9.026345252990723, "epoch": 0.397172236503856, "mean_token_accuracy": 0.7400644421577454, "num_tokens": 22078739.0, "step": 4017, "train/ce_loss": 1.1032159328460693 }, { "epoch": 0.397172236503856, "step": 4017, "train/sim_loss": 0.046875 }, { "epoch": 0.397172236503856, "step": 4017, "train/total_loss": 0.15719659626483917 }, { "entropy": 9.033388137817383, "epoch": 0.39727110935337157, "mean_token_accuracy": 0.7206982374191284, "num_tokens": 22084163.0, "step": 4018, "train/ce_loss": 0.9252844452857971 }, { "epoch": 0.39727110935337157, "step": 4018, "train/sim_loss": 0.078125 }, { "epoch": 0.39727110935337157, "step": 4018, "train/total_loss": 0.17065344750881195 }, { "entropy": 8.672633171081543, "epoch": 0.3973699822028871, "mean_token_accuracy": 0.7940630912780762, "num_tokens": 22089917.0, "step": 4019, "train/ce_loss": 0.5897098183631897 }, { "epoch": 0.3973699822028871, "step": 4019, "train/sim_loss": 0.12109375 }, { "epoch": 0.3973699822028871, "step": 4019, "train/total_loss": 0.18006473779678345 }, { "epoch": 0.3974688550524026, "grad_norm": 0.6401883959770203, "learning_rate": 9.00880185926915e-06, "loss": 0.1449, "step": 4020 }, { "entropy": 9.027091979980469, "epoch": 0.3974688550524026, "mean_token_accuracy": 0.7469437718391418, "num_tokens": 22095372.0, "step": 4020, "train/ce_loss": 0.6098704934120178 }, { "epoch": 0.3974688550524026, "step": 4020, "train/sim_loss": 0.07421875 }, { "epoch": 0.3974688550524026, "step": 4020, "train/total_loss": 0.13520580530166626 }, { "entropy": 9.181427001953125, "epoch": 0.39756772790191813, "mean_token_accuracy": 0.7615894079208374, "num_tokens": 22100761.0, "step": 4021, "train/ce_loss": 0.8020766377449036 }, { "epoch": 0.39756772790191813, "step": 4021, "train/sim_loss": 0.03125 }, { "epoch": 0.39756772790191813, "step": 4021, "train/total_loss": 0.11145766824483871 }, { "entropy": 8.967809677124023, "epoch": 0.3976666007514337, "mean_token_accuracy": 0.733619749546051, "num_tokens": 22106333.0, "step": 4022, "train/ce_loss": 0.6132085919380188 }, { "epoch": 0.3976666007514337, "step": 4022, "train/sim_loss": 0.046875 }, { "epoch": 0.3976666007514337, "step": 4022, "train/total_loss": 0.10819585621356964 }, { "entropy": 8.873201370239258, "epoch": 0.39776547360094916, "mean_token_accuracy": 0.7694915533065796, "num_tokens": 22111843.0, "step": 4023, "train/ce_loss": 0.5987311601638794 }, { "epoch": 0.39776547360094916, "step": 4023, "train/sim_loss": 0.08203125 }, { "epoch": 0.39776547360094916, "step": 4023, "train/total_loss": 0.14190436899662018 }, { "entropy": 9.019055366516113, "epoch": 0.3978643464504647, "mean_token_accuracy": 0.7503060102462769, "num_tokens": 22117310.0, "step": 4024, "train/ce_loss": 0.5501068830490112 }, { "epoch": 0.3978643464504647, "step": 4024, "train/sim_loss": 0.0234375 }, { "epoch": 0.3978643464504647, "step": 4024, "train/total_loss": 0.07844819128513336 }, { "entropy": 9.012530326843262, "epoch": 0.39796321929998024, "mean_token_accuracy": 0.6941489577293396, "num_tokens": 22122723.0, "step": 4025, "train/ce_loss": 0.6570414900779724 }, { "epoch": 0.39796321929998024, "step": 4025, "train/sim_loss": 0.078125 }, { "epoch": 0.39796321929998024, "step": 4025, "train/total_loss": 0.14382915198802948 }, { "entropy": 9.277970314025879, "epoch": 0.3980620921494957, "mean_token_accuracy": 0.6875, "num_tokens": 22128299.0, "step": 4026, "train/ce_loss": 2.001228094100952 }, { "epoch": 0.3980620921494957, "step": 4026, "train/sim_loss": 0.07421875 }, { "epoch": 0.3980620921494957, "step": 4026, "train/total_loss": 0.2743415832519531 }, { "entropy": 9.12454605102539, "epoch": 0.39816096499901127, "mean_token_accuracy": 0.7865771651268005, "num_tokens": 22133703.0, "step": 4027, "train/ce_loss": 0.5093667507171631 }, { "epoch": 0.39816096499901127, "step": 4027, "train/sim_loss": 0.0546875 }, { "epoch": 0.39816096499901127, "step": 4027, "train/total_loss": 0.10562417656183243 }, { "entropy": 8.710476875305176, "epoch": 0.3982598378485268, "mean_token_accuracy": 0.7786332964897156, "num_tokens": 22139411.0, "step": 4028, "train/ce_loss": 0.8279997110366821 }, { "epoch": 0.3982598378485268, "step": 4028, "train/sim_loss": 0.0859375 }, { "epoch": 0.3982598378485268, "step": 4028, "train/total_loss": 0.1687374711036682 }, { "entropy": 8.722845077514648, "epoch": 0.3983587106980423, "mean_token_accuracy": 0.6968325972557068, "num_tokens": 22145208.0, "step": 4029, "train/ce_loss": 1.4429744482040405 }, { "epoch": 0.3983587106980423, "step": 4029, "train/sim_loss": 0.0703125 }, { "epoch": 0.3983587106980423, "step": 4029, "train/total_loss": 0.21460995078086853 }, { "entropy": 9.231689453125, "epoch": 0.39845758354755784, "mean_token_accuracy": 0.764060378074646, "num_tokens": 22150517.0, "step": 4030, "train/ce_loss": 0.8042556643486023 }, { "epoch": 0.39845758354755784, "step": 4030, "train/sim_loss": 0.02734375 }, { "epoch": 0.39845758354755784, "step": 4030, "train/total_loss": 0.10776931792497635 }, { "entropy": 9.144826889038086, "epoch": 0.3985564563970734, "mean_token_accuracy": 0.7151514887809753, "num_tokens": 22155992.0, "step": 4031, "train/ce_loss": 1.1779955625534058 }, { "epoch": 0.3985564563970734, "step": 4031, "train/sim_loss": 0.11328125 }, { "epoch": 0.3985564563970734, "step": 4031, "train/total_loss": 0.2310808002948761 }, { "entropy": 8.91969108581543, "epoch": 0.39865532924658886, "mean_token_accuracy": 0.7386634945869446, "num_tokens": 22161523.0, "step": 4032, "train/ce_loss": 0.5465843081474304 }, { "epoch": 0.39865532924658886, "step": 4032, "train/sim_loss": 0.0390625 }, { "epoch": 0.39865532924658886, "step": 4032, "train/total_loss": 0.0937209278345108 }, { "entropy": 8.961355209350586, "epoch": 0.3987542020961044, "mean_token_accuracy": 0.7520092129707336, "num_tokens": 22167004.0, "step": 4033, "train/ce_loss": 1.0577248334884644 }, { "epoch": 0.3987542020961044, "step": 4033, "train/sim_loss": 0.08984375 }, { "epoch": 0.3987542020961044, "step": 4033, "train/total_loss": 0.1956162452697754 }, { "entropy": 9.193891525268555, "epoch": 0.39885307494561995, "mean_token_accuracy": 0.7384615540504456, "num_tokens": 22172314.0, "step": 4034, "train/ce_loss": 0.8432437181472778 }, { "epoch": 0.39885307494561995, "step": 4034, "train/sim_loss": 0.046875 }, { "epoch": 0.39885307494561995, "step": 4034, "train/total_loss": 0.13119937479496002 }, { "entropy": 9.211508750915527, "epoch": 0.39895194779513543, "mean_token_accuracy": 0.7380660772323608, "num_tokens": 22177674.0, "step": 4035, "train/ce_loss": 1.4387216567993164 }, { "epoch": 0.39895194779513543, "step": 4035, "train/sim_loss": 0.0546875 }, { "epoch": 0.39895194779513543, "step": 4035, "train/total_loss": 0.19855967164039612 }, { "entropy": 8.876422882080078, "epoch": 0.39905082064465097, "mean_token_accuracy": 0.7352941036224365, "num_tokens": 22183213.0, "step": 4036, "train/ce_loss": 0.6954538226127625 }, { "epoch": 0.39905082064465097, "step": 4036, "train/sim_loss": 0.07421875 }, { "epoch": 0.39905082064465097, "step": 4036, "train/total_loss": 0.14376413822174072 }, { "entropy": 8.749289512634277, "epoch": 0.3991496934941665, "mean_token_accuracy": 0.7542799711227417, "num_tokens": 22188802.0, "step": 4037, "train/ce_loss": 0.8178299069404602 }, { "epoch": 0.3991496934941665, "step": 4037, "train/sim_loss": 0.046875 }, { "epoch": 0.3991496934941665, "step": 4037, "train/total_loss": 0.1286579966545105 }, { "entropy": 8.732597351074219, "epoch": 0.399248566343682, "mean_token_accuracy": 0.7529069781303406, "num_tokens": 22194468.0, "step": 4038, "train/ce_loss": 0.6482329964637756 }, { "epoch": 0.399248566343682, "step": 4038, "train/sim_loss": 0.0546875 }, { "epoch": 0.399248566343682, "step": 4038, "train/total_loss": 0.11951079964637756 }, { "entropy": 8.845829010009766, "epoch": 0.39934743919319754, "mean_token_accuracy": 0.7225244641304016, "num_tokens": 22199906.0, "step": 4039, "train/ce_loss": 0.5166327357292175 }, { "epoch": 0.39934743919319754, "step": 4039, "train/sim_loss": 0.0234375 }, { "epoch": 0.39934743919319754, "step": 4039, "train/total_loss": 0.07510077953338623 }, { "epoch": 0.3994463120427131, "grad_norm": 0.7234612107276917, "learning_rate": 9.0038569945112e-06, "loss": 0.1441, "step": 4040 }, { "entropy": 8.677694320678711, "epoch": 0.3994463120427131, "mean_token_accuracy": 0.7026476860046387, "num_tokens": 22205427.0, "step": 4040, "train/ce_loss": 1.420459270477295 }, { "epoch": 0.3994463120427131, "step": 4040, "train/sim_loss": 0.10546875 }, { "epoch": 0.3994463120427131, "step": 4040, "train/total_loss": 0.24751468002796173 }, { "entropy": 9.157602310180664, "epoch": 0.39954518489222857, "mean_token_accuracy": 0.7315130829811096, "num_tokens": 22210899.0, "step": 4041, "train/ce_loss": 0.45605841279029846 }, { "epoch": 0.39954518489222857, "step": 4041, "train/sim_loss": 0.03125 }, { "epoch": 0.39954518489222857, "step": 4041, "train/total_loss": 0.07685583829879761 }, { "entropy": 9.043392181396484, "epoch": 0.3996440577417441, "mean_token_accuracy": 0.8242574334144592, "num_tokens": 22216353.0, "step": 4042, "train/ce_loss": 0.5218964219093323 }, { "epoch": 0.3996440577417441, "step": 4042, "train/sim_loss": 0.0703125 }, { "epoch": 0.3996440577417441, "step": 4042, "train/total_loss": 0.1225021481513977 }, { "entropy": 8.788995742797852, "epoch": 0.39974293059125965, "mean_token_accuracy": 0.7542457580566406, "num_tokens": 22221922.0, "step": 4043, "train/ce_loss": 0.7332941889762878 }, { "epoch": 0.39974293059125965, "step": 4043, "train/sim_loss": 0.0546875 }, { "epoch": 0.39974293059125965, "step": 4043, "train/total_loss": 0.12801691889762878 }, { "entropy": 8.693917274475098, "epoch": 0.3998418034407752, "mean_token_accuracy": 0.71875, "num_tokens": 22227493.0, "step": 4044, "train/ce_loss": 0.9059188961982727 }, { "epoch": 0.3998418034407752, "step": 4044, "train/sim_loss": 0.02734375 }, { "epoch": 0.3998418034407752, "step": 4044, "train/total_loss": 0.11793564260005951 }, { "entropy": 8.972824096679688, "epoch": 0.3999406762902907, "mean_token_accuracy": 0.6994680762290955, "num_tokens": 22232840.0, "step": 4045, "train/ce_loss": 1.1064380407333374 }, { "epoch": 0.3999406762902907, "step": 4045, "train/sim_loss": 0.046875 }, { "epoch": 0.3999406762902907, "step": 4045, "train/total_loss": 0.15751880407333374 }, { "entropy": 8.86695671081543, "epoch": 0.4000395491398062, "mean_token_accuracy": 0.73380446434021, "num_tokens": 22238259.0, "step": 4046, "train/ce_loss": 0.6365875601768494 }, { "epoch": 0.4000395491398062, "step": 4046, "train/sim_loss": 0.0625 }, { "epoch": 0.4000395491398062, "step": 4046, "train/total_loss": 0.12615875899791718 }, { "entropy": 8.694969177246094, "epoch": 0.40013842198932176, "mean_token_accuracy": 0.7819047570228577, "num_tokens": 22243921.0, "step": 4047, "train/ce_loss": 0.8229476809501648 }, { "epoch": 0.40013842198932176, "step": 4047, "train/sim_loss": 0.046875 }, { "epoch": 0.40013842198932176, "step": 4047, "train/total_loss": 0.129169762134552 }, { "entropy": 8.742379188537598, "epoch": 0.40023729483883724, "mean_token_accuracy": 0.8018292784690857, "num_tokens": 22249535.0, "step": 4048, "train/ce_loss": 0.5149574875831604 }, { "epoch": 0.40023729483883724, "step": 4048, "train/sim_loss": 0.0625 }, { "epoch": 0.40023729483883724, "step": 4048, "train/total_loss": 0.1139957457780838 }, { "entropy": 8.895851135253906, "epoch": 0.4003361676883528, "mean_token_accuracy": 0.7331154942512512, "num_tokens": 22255006.0, "step": 4049, "train/ce_loss": 1.322335124015808 }, { "epoch": 0.4003361676883528, "step": 4049, "train/sim_loss": 0.078125 }, { "epoch": 0.4003361676883528, "step": 4049, "train/total_loss": 0.21035851538181305 }, { "entropy": 9.202339172363281, "epoch": 0.4004350405378683, "mean_token_accuracy": 0.7537619471549988, "num_tokens": 22260362.0, "step": 4050, "train/ce_loss": 0.6239035129547119 }, { "epoch": 0.4004350405378683, "step": 4050, "train/sim_loss": 0.03125 }, { "epoch": 0.4004350405378683, "step": 4050, "train/total_loss": 0.09364035725593567 }, { "entropy": 9.023335456848145, "epoch": 0.4005339133873838, "mean_token_accuracy": 0.7837541103363037, "num_tokens": 22265806.0, "step": 4051, "train/ce_loss": 0.4239130914211273 }, { "epoch": 0.4005339133873838, "step": 4051, "train/sim_loss": 0.01953125 }, { "epoch": 0.4005339133873838, "step": 4051, "train/total_loss": 0.06192256137728691 }, { "entropy": 8.909139633178711, "epoch": 0.40063278623689935, "mean_token_accuracy": 0.7718918919563293, "num_tokens": 22271437.0, "step": 4052, "train/ce_loss": 0.5500393509864807 }, { "epoch": 0.40063278623689935, "step": 4052, "train/sim_loss": 0.04296875 }, { "epoch": 0.40063278623689935, "step": 4052, "train/total_loss": 0.09797269105911255 }, { "entropy": 8.879587173461914, "epoch": 0.4007316590864149, "mean_token_accuracy": 0.7610887289047241, "num_tokens": 22277089.0, "step": 4053, "train/ce_loss": 0.9933063387870789 }, { "epoch": 0.4007316590864149, "step": 4053, "train/sim_loss": 0.07421875 }, { "epoch": 0.4007316590864149, "step": 4053, "train/total_loss": 0.17354938387870789 }, { "entropy": 9.168947219848633, "epoch": 0.4008305319359304, "mean_token_accuracy": 0.7913278937339783, "num_tokens": 22282452.0, "step": 4054, "train/ce_loss": 0.7340458631515503 }, { "epoch": 0.4008305319359304, "step": 4054, "train/sim_loss": 0.0390625 }, { "epoch": 0.4008305319359304, "step": 4054, "train/total_loss": 0.11246708780527115 }, { "entropy": 8.716483116149902, "epoch": 0.4009294047854459, "mean_token_accuracy": 0.7197664976119995, "num_tokens": 22288410.0, "step": 4055, "train/ce_loss": 0.9856875538825989 }, { "epoch": 0.4009294047854459, "step": 4055, "train/sim_loss": 0.08203125 }, { "epoch": 0.4009294047854459, "step": 4055, "train/total_loss": 0.18060001730918884 }, { "entropy": 9.069129943847656, "epoch": 0.40102827763496146, "mean_token_accuracy": 0.7638888955116272, "num_tokens": 22293768.0, "step": 4056, "train/ce_loss": 0.6239318251609802 }, { "epoch": 0.40102827763496146, "step": 4056, "train/sim_loss": 0.03125 }, { "epoch": 0.40102827763496146, "step": 4056, "train/total_loss": 0.0936431884765625 }, { "entropy": 9.10161018371582, "epoch": 0.40112715048447695, "mean_token_accuracy": 0.7513368725776672, "num_tokens": 22299193.0, "step": 4057, "train/ce_loss": 0.8542079925537109 }, { "epoch": 0.40112715048447695, "step": 4057, "train/sim_loss": 0.0859375 }, { "epoch": 0.40112715048447695, "step": 4057, "train/total_loss": 0.17135830223560333 }, { "entropy": 8.970346450805664, "epoch": 0.4012260233339925, "mean_token_accuracy": 0.7352941036224365, "num_tokens": 22304699.0, "step": 4058, "train/ce_loss": 0.6941327452659607 }, { "epoch": 0.4012260233339925, "step": 4058, "train/sim_loss": 0.0546875 }, { "epoch": 0.4012260233339925, "step": 4058, "train/total_loss": 0.12410077452659607 }, { "entropy": 9.221994400024414, "epoch": 0.40132489618350803, "mean_token_accuracy": 0.7732207775115967, "num_tokens": 22310121.0, "step": 4059, "train/ce_loss": 0.8129037618637085 }, { "epoch": 0.40132489618350803, "step": 4059, "train/sim_loss": 0.06640625 }, { "epoch": 0.40132489618350803, "step": 4059, "train/total_loss": 0.1476966291666031 }, { "epoch": 0.4014237690330235, "grad_norm": 0.7485663294792175, "learning_rate": 8.998912129753253e-06, "loss": 0.1369, "step": 4060 }, { "entropy": 8.594341278076172, "epoch": 0.4014237690330235, "mean_token_accuracy": 0.7018150091171265, "num_tokens": 22315831.0, "step": 4060, "train/ce_loss": 0.3827391266822815 }, { "epoch": 0.4014237690330235, "step": 4060, "train/sim_loss": 0.0859375 }, { "epoch": 0.4014237690330235, "step": 4060, "train/total_loss": 0.12421141564846039 }, { "entropy": 9.273946762084961, "epoch": 0.40152264188253906, "mean_token_accuracy": 0.760351300239563, "num_tokens": 22321244.0, "step": 4061, "train/ce_loss": 0.6724047660827637 }, { "epoch": 0.40152264188253906, "step": 4061, "train/sim_loss": 0.125 }, { "epoch": 0.40152264188253906, "step": 4061, "train/total_loss": 0.19224047660827637 }, { "entropy": 8.790567398071289, "epoch": 0.4016215147320546, "mean_token_accuracy": 0.8077314496040344, "num_tokens": 22326860.0, "step": 4062, "train/ce_loss": 0.6060258150100708 }, { "epoch": 0.4016215147320546, "step": 4062, "train/sim_loss": 0.03515625 }, { "epoch": 0.4016215147320546, "step": 4062, "train/total_loss": 0.0957588329911232 }, { "entropy": 9.24850082397461, "epoch": 0.4017203875815701, "mean_token_accuracy": 0.7575376629829407, "num_tokens": 22332286.0, "step": 4063, "train/ce_loss": 0.9345538020133972 }, { "epoch": 0.4017203875815701, "step": 4063, "train/sim_loss": 0.0625 }, { "epoch": 0.4017203875815701, "step": 4063, "train/total_loss": 0.15595537424087524 }, { "entropy": 9.208782196044922, "epoch": 0.4018192604310856, "mean_token_accuracy": 0.7316455841064453, "num_tokens": 22337741.0, "step": 4064, "train/ce_loss": 1.4008188247680664 }, { "epoch": 0.4018192604310856, "step": 4064, "train/sim_loss": 0.06640625 }, { "epoch": 0.4018192604310856, "step": 4064, "train/total_loss": 0.20648813247680664 }, { "entropy": 8.76080322265625, "epoch": 0.40191813328060116, "mean_token_accuracy": 0.6841541528701782, "num_tokens": 22343149.0, "step": 4065, "train/ce_loss": 0.5999292135238647 }, { "epoch": 0.40191813328060116, "step": 4065, "train/sim_loss": 0.046875 }, { "epoch": 0.40191813328060116, "step": 4065, "train/total_loss": 0.10686792433261871 }, { "entropy": 9.024477005004883, "epoch": 0.40201700613011665, "mean_token_accuracy": 0.711561381816864, "num_tokens": 22348704.0, "step": 4066, "train/ce_loss": 0.6147543787956238 }, { "epoch": 0.40201700613011665, "step": 4066, "train/sim_loss": 0.06640625 }, { "epoch": 0.40201700613011665, "step": 4066, "train/total_loss": 0.12788169085979462 }, { "entropy": 9.134902000427246, "epoch": 0.4021158789796322, "mean_token_accuracy": 0.7739018201828003, "num_tokens": 22354128.0, "step": 4067, "train/ce_loss": 0.8797760605812073 }, { "epoch": 0.4021158789796322, "step": 4067, "train/sim_loss": 0.08203125 }, { "epoch": 0.4021158789796322, "step": 4067, "train/total_loss": 0.17000886797904968 }, { "entropy": 8.834650039672852, "epoch": 0.40221475182914773, "mean_token_accuracy": 0.6659877896308899, "num_tokens": 22359691.0, "step": 4068, "train/ce_loss": 0.48245295882225037 }, { "epoch": 0.40221475182914773, "step": 4068, "train/sim_loss": 0.05078125 }, { "epoch": 0.40221475182914773, "step": 4068, "train/total_loss": 0.09902654588222504 }, { "entropy": 8.85243034362793, "epoch": 0.4023136246786632, "mean_token_accuracy": 0.7269663214683533, "num_tokens": 22365200.0, "step": 4069, "train/ce_loss": 0.6164126992225647 }, { "epoch": 0.4023136246786632, "step": 4069, "train/sim_loss": 0.0625 }, { "epoch": 0.4023136246786632, "step": 4069, "train/total_loss": 0.12414127588272095 }, { "entropy": 9.135835647583008, "epoch": 0.40241249752817876, "mean_token_accuracy": 0.7380645275115967, "num_tokens": 22370647.0, "step": 4070, "train/ce_loss": 0.9173439741134644 }, { "epoch": 0.40241249752817876, "step": 4070, "train/sim_loss": 0.0625 }, { "epoch": 0.40241249752817876, "step": 4070, "train/total_loss": 0.1542344093322754 }, { "entropy": 8.778614044189453, "epoch": 0.4025113703776943, "mean_token_accuracy": 0.7333333492279053, "num_tokens": 22376191.0, "step": 4071, "train/ce_loss": 0.5573059916496277 }, { "epoch": 0.4025113703776943, "step": 4071, "train/sim_loss": 0.03125 }, { "epoch": 0.4025113703776943, "step": 4071, "train/total_loss": 0.08698059618473053 }, { "entropy": 9.130858421325684, "epoch": 0.4026102432272098, "mean_token_accuracy": 0.72951740026474, "num_tokens": 22381743.0, "step": 4072, "train/ce_loss": 0.8517532348632812 }, { "epoch": 0.4026102432272098, "step": 4072, "train/sim_loss": 0.11328125 }, { "epoch": 0.4026102432272098, "step": 4072, "train/total_loss": 0.19845658540725708 }, { "entropy": 8.752975463867188, "epoch": 0.4027091160767253, "mean_token_accuracy": 0.6600345969200134, "num_tokens": 22387472.0, "step": 4073, "train/ce_loss": 2.3919003009796143 }, { "epoch": 0.4027091160767253, "step": 4073, "train/sim_loss": 0.04296875 }, { "epoch": 0.4027091160767253, "step": 4073, "train/total_loss": 0.2821587920188904 }, { "entropy": 8.660243034362793, "epoch": 0.40280798892624087, "mean_token_accuracy": 0.7145649790763855, "num_tokens": 22393131.0, "step": 4074, "train/ce_loss": 1.3733108043670654 }, { "epoch": 0.40280798892624087, "step": 4074, "train/sim_loss": 0.09765625 }, { "epoch": 0.40280798892624087, "step": 4074, "train/total_loss": 0.23498733341693878 }, { "entropy": 9.012033462524414, "epoch": 0.40290686177575635, "mean_token_accuracy": 0.7423076629638672, "num_tokens": 22398787.0, "step": 4075, "train/ce_loss": 0.4553801119327545 }, { "epoch": 0.40290686177575635, "step": 4075, "train/sim_loss": 0.0234375 }, { "epoch": 0.40290686177575635, "step": 4075, "train/total_loss": 0.06897550821304321 }, { "entropy": 8.828946113586426, "epoch": 0.4030057346252719, "mean_token_accuracy": 0.7361660003662109, "num_tokens": 22404401.0, "step": 4076, "train/ce_loss": 0.6154571175575256 }, { "epoch": 0.4030057346252719, "step": 4076, "train/sim_loss": 0.03125 }, { "epoch": 0.4030057346252719, "step": 4076, "train/total_loss": 0.0927957147359848 }, { "entropy": 9.245216369628906, "epoch": 0.40310460747478744, "mean_token_accuracy": 0.7472826242446899, "num_tokens": 22409799.0, "step": 4077, "train/ce_loss": 0.663863480091095 }, { "epoch": 0.40310460747478744, "step": 4077, "train/sim_loss": 0.03515625 }, { "epoch": 0.40310460747478744, "step": 4077, "train/total_loss": 0.10154259949922562 }, { "entropy": 8.982665061950684, "epoch": 0.4032034803243029, "mean_token_accuracy": 0.7478474974632263, "num_tokens": 22415220.0, "step": 4078, "train/ce_loss": 1.1384378671646118 }, { "epoch": 0.4032034803243029, "step": 4078, "train/sim_loss": 0.109375 }, { "epoch": 0.4032034803243029, "step": 4078, "train/total_loss": 0.22321879863739014 }, { "entropy": 9.053544044494629, "epoch": 0.40330235317381846, "mean_token_accuracy": 0.7942157983779907, "num_tokens": 22420718.0, "step": 4079, "train/ce_loss": 0.6571411490440369 }, { "epoch": 0.40330235317381846, "step": 4079, "train/sim_loss": 0.0625 }, { "epoch": 0.40330235317381846, "step": 4079, "train/total_loss": 0.12821412086486816 }, { "epoch": 0.403401226023334, "grad_norm": 0.6118993759155273, "learning_rate": 8.993967264995303e-06, "loss": 0.1473, "step": 4080 }, { "entropy": 9.189098358154297, "epoch": 0.403401226023334, "mean_token_accuracy": 0.8233731985092163, "num_tokens": 22426105.0, "step": 4080, "train/ce_loss": 0.4570057988166809 }, { "epoch": 0.403401226023334, "step": 4080, "train/sim_loss": 0.02734375 }, { "epoch": 0.403401226023334, "step": 4080, "train/total_loss": 0.07304432988166809 }, { "entropy": 8.968326568603516, "epoch": 0.4035000988728495, "mean_token_accuracy": 0.7185929417610168, "num_tokens": 22431548.0, "step": 4081, "train/ce_loss": 0.7980078458786011 }, { "epoch": 0.4035000988728495, "step": 4081, "train/sim_loss": 0.0546875 }, { "epoch": 0.4035000988728495, "step": 4081, "train/total_loss": 0.1344882845878601 }, { "entropy": 8.746297836303711, "epoch": 0.40359897172236503, "mean_token_accuracy": 0.7233849167823792, "num_tokens": 22437215.0, "step": 4082, "train/ce_loss": 0.5097014307975769 }, { "epoch": 0.40359897172236503, "step": 4082, "train/sim_loss": 0.03125 }, { "epoch": 0.40359897172236503, "step": 4082, "train/total_loss": 0.08222014456987381 }, { "entropy": 9.186837196350098, "epoch": 0.40369784457188057, "mean_token_accuracy": 0.7798377871513367, "num_tokens": 22442697.0, "step": 4083, "train/ce_loss": 0.6324573755264282 }, { "epoch": 0.40369784457188057, "step": 4083, "train/sim_loss": 0.0546875 }, { "epoch": 0.40369784457188057, "step": 4083, "train/total_loss": 0.1179332360625267 }, { "entropy": 8.645106315612793, "epoch": 0.4037967174213961, "mean_token_accuracy": 0.7817418575286865, "num_tokens": 22448300.0, "step": 4084, "train/ce_loss": 0.620408296585083 }, { "epoch": 0.4037967174213961, "step": 4084, "train/sim_loss": 0.0546875 }, { "epoch": 0.4037967174213961, "step": 4084, "train/total_loss": 0.11672833561897278 }, { "entropy": 9.034409523010254, "epoch": 0.4038955902709116, "mean_token_accuracy": 0.7333333492279053, "num_tokens": 22453799.0, "step": 4085, "train/ce_loss": 0.7014113068580627 }, { "epoch": 0.4038955902709116, "step": 4085, "train/sim_loss": 0.078125 }, { "epoch": 0.4038955902709116, "step": 4085, "train/total_loss": 0.14826613664627075 }, { "entropy": 8.833980560302734, "epoch": 0.40399446312042714, "mean_token_accuracy": 0.7325102686882019, "num_tokens": 22459381.0, "step": 4086, "train/ce_loss": 0.8660094738006592 }, { "epoch": 0.40399446312042714, "step": 4086, "train/sim_loss": 0.046875 }, { "epoch": 0.40399446312042714, "step": 4086, "train/total_loss": 0.13347595930099487 }, { "entropy": 9.150894165039062, "epoch": 0.4040933359699427, "mean_token_accuracy": 0.7976331114768982, "num_tokens": 22464844.0, "step": 4087, "train/ce_loss": 0.3878864347934723 }, { "epoch": 0.4040933359699427, "step": 4087, "train/sim_loss": 0.0234375 }, { "epoch": 0.4040933359699427, "step": 4087, "train/total_loss": 0.06222614273428917 }, { "entropy": 8.77254867553711, "epoch": 0.40419220881945817, "mean_token_accuracy": 0.7207392454147339, "num_tokens": 22470359.0, "step": 4088, "train/ce_loss": 0.6788516044616699 }, { "epoch": 0.40419220881945817, "step": 4088, "train/sim_loss": 0.03515625 }, { "epoch": 0.40419220881945817, "step": 4088, "train/total_loss": 0.10304141044616699 }, { "entropy": 8.85892105102539, "epoch": 0.4042910816689737, "mean_token_accuracy": 0.7604060769081116, "num_tokens": 22476023.0, "step": 4089, "train/ce_loss": 0.8189828395843506 }, { "epoch": 0.4042910816689737, "step": 4089, "train/sim_loss": 0.0625 }, { "epoch": 0.4042910816689737, "step": 4089, "train/total_loss": 0.1443982869386673 }, { "entropy": 8.873918533325195, "epoch": 0.40438995451848925, "mean_token_accuracy": 0.7719836235046387, "num_tokens": 22481640.0, "step": 4090, "train/ce_loss": 0.4682827889919281 }, { "epoch": 0.40438995451848925, "step": 4090, "train/sim_loss": 0.015625 }, { "epoch": 0.40438995451848925, "step": 4090, "train/total_loss": 0.06245328113436699 }, { "entropy": 8.700141906738281, "epoch": 0.40448882736800473, "mean_token_accuracy": 0.7379385828971863, "num_tokens": 22487175.0, "step": 4091, "train/ce_loss": 0.6015080809593201 }, { "epoch": 0.40448882736800473, "step": 4091, "train/sim_loss": 0.0234375 }, { "epoch": 0.40448882736800473, "step": 4091, "train/total_loss": 0.08358830958604813 }, { "entropy": 8.584639549255371, "epoch": 0.4045877002175203, "mean_token_accuracy": 0.6707021594047546, "num_tokens": 22492976.0, "step": 4092, "train/ce_loss": 1.8881745338439941 }, { "epoch": 0.4045877002175203, "step": 4092, "train/sim_loss": 0.06640625 }, { "epoch": 0.4045877002175203, "step": 4092, "train/total_loss": 0.25522369146347046 }, { "entropy": 9.195140838623047, "epoch": 0.4046865730670358, "mean_token_accuracy": 0.7271468043327332, "num_tokens": 22498345.0, "step": 4093, "train/ce_loss": 0.3204111158847809 }, { "epoch": 0.4046865730670358, "step": 4093, "train/sim_loss": 0.0625 }, { "epoch": 0.4046865730670358, "step": 4093, "train/total_loss": 0.09454111754894257 }, { "entropy": 8.231011390686035, "epoch": 0.4047854459165513, "mean_token_accuracy": 0.7024911046028137, "num_tokens": 22504260.0, "step": 4094, "train/ce_loss": 1.3575011491775513 }, { "epoch": 0.4047854459165513, "step": 4094, "train/sim_loss": 0.0625 }, { "epoch": 0.4047854459165513, "step": 4094, "train/total_loss": 0.19825011491775513 }, { "entropy": 9.145477294921875, "epoch": 0.40488431876606684, "mean_token_accuracy": 0.74609375, "num_tokens": 22509650.0, "step": 4095, "train/ce_loss": 0.6746314167976379 }, { "epoch": 0.40488431876606684, "step": 4095, "train/sim_loss": 0.08984375 }, { "epoch": 0.40488431876606684, "step": 4095, "train/total_loss": 0.15730689465999603 }, { "entropy": 9.342007637023926, "epoch": 0.4049831916155824, "mean_token_accuracy": 0.7476383447647095, "num_tokens": 22514951.0, "step": 4096, "train/ce_loss": 0.7805969715118408 }, { "epoch": 0.4049831916155824, "step": 4096, "train/sim_loss": 0.10546875 }, { "epoch": 0.4049831916155824, "step": 4096, "train/total_loss": 0.18352845311164856 }, { "entropy": 9.004839897155762, "epoch": 0.40508206446509787, "mean_token_accuracy": 0.7399309277534485, "num_tokens": 22520401.0, "step": 4097, "train/ce_loss": 0.45559272170066833 }, { "epoch": 0.40508206446509787, "step": 4097, "train/sim_loss": 0.03125 }, { "epoch": 0.40508206446509787, "step": 4097, "train/total_loss": 0.07680927217006683 }, { "entropy": 8.679258346557617, "epoch": 0.4051809373146134, "mean_token_accuracy": 0.7019867300987244, "num_tokens": 22526022.0, "step": 4098, "train/ce_loss": 1.7803421020507812 }, { "epoch": 0.4051809373146134, "step": 4098, "train/sim_loss": 0.0703125 }, { "epoch": 0.4051809373146134, "step": 4098, "train/total_loss": 0.2483467161655426 }, { "entropy": 9.161215782165527, "epoch": 0.40527981016412895, "mean_token_accuracy": 0.7402597665786743, "num_tokens": 22531255.0, "step": 4099, "train/ce_loss": 0.5631232261657715 }, { "epoch": 0.40527981016412895, "step": 4099, "train/sim_loss": 0.0546875 }, { "epoch": 0.40527981016412895, "step": 4099, "train/total_loss": 0.11099982261657715 }, { "epoch": 0.40537868301364444, "grad_norm": 0.7420522570610046, "learning_rate": 8.989022400237354e-06, "loss": 0.1422, "step": 4100 }, { "entropy": 8.74085807800293, "epoch": 0.40537868301364444, "mean_token_accuracy": 0.7384780049324036, "num_tokens": 22536788.0, "step": 4100, "train/ce_loss": 0.9067572355270386 }, { "epoch": 0.40537868301364444, "step": 4100, "train/sim_loss": 0.0546875 }, { "epoch": 0.40537868301364444, "step": 4100, "train/total_loss": 0.1453632265329361 }, { "entropy": 8.734097480773926, "epoch": 0.40547755586316, "mean_token_accuracy": 0.7346072196960449, "num_tokens": 22542306.0, "step": 4101, "train/ce_loss": 0.45541709661483765 }, { "epoch": 0.40547755586316, "step": 4101, "train/sim_loss": 0.02734375 }, { "epoch": 0.40547755586316, "step": 4101, "train/total_loss": 0.07288546115159988 }, { "entropy": 8.861083030700684, "epoch": 0.4055764287126755, "mean_token_accuracy": 0.7248908281326294, "num_tokens": 22547892.0, "step": 4102, "train/ce_loss": 1.140023946762085 }, { "epoch": 0.4055764287126755, "step": 4102, "train/sim_loss": 0.09375 }, { "epoch": 0.4055764287126755, "step": 4102, "train/total_loss": 0.20775240659713745 }, { "entropy": 8.812078475952148, "epoch": 0.405675301562191, "mean_token_accuracy": 0.7223264575004578, "num_tokens": 22553523.0, "step": 4103, "train/ce_loss": 1.872029423713684 }, { "epoch": 0.405675301562191, "step": 4103, "train/sim_loss": 0.05078125 }, { "epoch": 0.405675301562191, "step": 4103, "train/total_loss": 0.23798419535160065 }, { "entropy": 9.301538467407227, "epoch": 0.40577417441170655, "mean_token_accuracy": 0.7763713002204895, "num_tokens": 22558838.0, "step": 4104, "train/ce_loss": 0.5490849614143372 }, { "epoch": 0.40577417441170655, "step": 4104, "train/sim_loss": 0.06640625 }, { "epoch": 0.40577417441170655, "step": 4104, "train/total_loss": 0.12131474912166595 }, { "entropy": 8.851760864257812, "epoch": 0.4058730472612221, "mean_token_accuracy": 0.7246511578559875, "num_tokens": 22564478.0, "step": 4105, "train/ce_loss": 0.3667882978916168 }, { "epoch": 0.4058730472612221, "step": 4105, "train/sim_loss": 0.0546875 }, { "epoch": 0.4058730472612221, "step": 4105, "train/total_loss": 0.09136633574962616 }, { "entropy": 8.754572868347168, "epoch": 0.40597192011073757, "mean_token_accuracy": 0.7270811200141907, "num_tokens": 22570110.0, "step": 4106, "train/ce_loss": 0.6266009211540222 }, { "epoch": 0.40597192011073757, "step": 4106, "train/sim_loss": 0.046875 }, { "epoch": 0.40597192011073757, "step": 4106, "train/total_loss": 0.1095350906252861 }, { "entropy": 8.785636901855469, "epoch": 0.4060707929602531, "mean_token_accuracy": 0.7045215368270874, "num_tokens": 22575579.0, "step": 4107, "train/ce_loss": 0.963702380657196 }, { "epoch": 0.4060707929602531, "step": 4107, "train/sim_loss": 0.06640625 }, { "epoch": 0.4060707929602531, "step": 4107, "train/total_loss": 0.16277649998664856 }, { "entropy": 9.084461212158203, "epoch": 0.40616966580976865, "mean_token_accuracy": 0.707317054271698, "num_tokens": 22581030.0, "step": 4108, "train/ce_loss": 0.9742991328239441 }, { "epoch": 0.40616966580976865, "step": 4108, "train/sim_loss": 0.0390625 }, { "epoch": 0.40616966580976865, "step": 4108, "train/total_loss": 0.13649241626262665 }, { "entropy": 8.861637115478516, "epoch": 0.40626853865928414, "mean_token_accuracy": 0.774944543838501, "num_tokens": 22586515.0, "step": 4109, "train/ce_loss": 0.5479894876480103 }, { "epoch": 0.40626853865928414, "step": 4109, "train/sim_loss": 0.04296875 }, { "epoch": 0.40626853865928414, "step": 4109, "train/total_loss": 0.09776769578456879 }, { "entropy": 9.21090316772461, "epoch": 0.4063674115087997, "mean_token_accuracy": 0.7551766037940979, "num_tokens": 22591890.0, "step": 4110, "train/ce_loss": 0.47950664162635803 }, { "epoch": 0.4063674115087997, "step": 4110, "train/sim_loss": 0.06640625 }, { "epoch": 0.4063674115087997, "step": 4110, "train/total_loss": 0.11435692012310028 }, { "entropy": 8.751771926879883, "epoch": 0.4064662843583152, "mean_token_accuracy": 0.7303609251976013, "num_tokens": 22597488.0, "step": 4111, "train/ce_loss": 0.6461188197135925 }, { "epoch": 0.4064662843583152, "step": 4111, "train/sim_loss": 0.046875 }, { "epoch": 0.4064662843583152, "step": 4111, "train/total_loss": 0.11148688197135925 }, { "entropy": 9.131616592407227, "epoch": 0.4065651572078307, "mean_token_accuracy": 0.7593712210655212, "num_tokens": 22602947.0, "step": 4112, "train/ce_loss": 0.7537021636962891 }, { "epoch": 0.4065651572078307, "step": 4112, "train/sim_loss": 0.0390625 }, { "epoch": 0.4065651572078307, "step": 4112, "train/total_loss": 0.11443271487951279 }, { "entropy": 9.040388107299805, "epoch": 0.40666403005734625, "mean_token_accuracy": 0.7165071964263916, "num_tokens": 22608422.0, "step": 4113, "train/ce_loss": 0.9539766311645508 }, { "epoch": 0.40666403005734625, "step": 4113, "train/sim_loss": 0.08203125 }, { "epoch": 0.40666403005734625, "step": 4113, "train/total_loss": 0.17742891609668732 }, { "entropy": 8.987548828125, "epoch": 0.4067629029068618, "mean_token_accuracy": 0.7887755036354065, "num_tokens": 22613987.0, "step": 4114, "train/ce_loss": 0.6036435961723328 }, { "epoch": 0.4067629029068618, "step": 4114, "train/sim_loss": 0.01953125 }, { "epoch": 0.4067629029068618, "step": 4114, "train/total_loss": 0.07989561557769775 }, { "entropy": 8.906057357788086, "epoch": 0.4068617757563773, "mean_token_accuracy": 0.7662337422370911, "num_tokens": 22619588.0, "step": 4115, "train/ce_loss": 0.5786186456680298 }, { "epoch": 0.4068617757563773, "step": 4115, "train/sim_loss": 0.11328125 }, { "epoch": 0.4068617757563773, "step": 4115, "train/total_loss": 0.17114311456680298 }, { "entropy": 8.378900527954102, "epoch": 0.4069606486058928, "mean_token_accuracy": 0.7211464047431946, "num_tokens": 22625478.0, "step": 4116, "train/ce_loss": 1.367689609527588 }, { "epoch": 0.4069606486058928, "step": 4116, "train/sim_loss": 0.06640625 }, { "epoch": 0.4069606486058928, "step": 4116, "train/total_loss": 0.20317521691322327 }, { "entropy": 8.850500106811523, "epoch": 0.40705952145540836, "mean_token_accuracy": 0.7813578844070435, "num_tokens": 22631007.0, "step": 4117, "train/ce_loss": 0.7732144594192505 }, { "epoch": 0.40705952145540836, "step": 4117, "train/sim_loss": 0.0625 }, { "epoch": 0.40705952145540836, "step": 4117, "train/total_loss": 0.13982143998146057 }, { "entropy": 9.038641929626465, "epoch": 0.40715839430492384, "mean_token_accuracy": 0.7363057136535645, "num_tokens": 22636471.0, "step": 4118, "train/ce_loss": 1.0279085636138916 }, { "epoch": 0.40715839430492384, "step": 4118, "train/sim_loss": 0.03125 }, { "epoch": 0.40715839430492384, "step": 4118, "train/total_loss": 0.13404086232185364 }, { "entropy": 8.729204177856445, "epoch": 0.4072572671544394, "mean_token_accuracy": 0.7810150384902954, "num_tokens": 22642160.0, "step": 4119, "train/ce_loss": 0.3434404134750366 }, { "epoch": 0.4072572671544394, "step": 4119, "train/sim_loss": 0.01953125 }, { "epoch": 0.4072572671544394, "step": 4119, "train/total_loss": 0.05387529358267784 }, { "epoch": 0.4073561400039549, "grad_norm": 0.5914965867996216, "learning_rate": 8.984077535479406e-06, "loss": 0.1356, "step": 4120 }, { "entropy": 9.235552787780762, "epoch": 0.4073561400039549, "mean_token_accuracy": 0.7968217730522156, "num_tokens": 22647565.0, "step": 4120, "train/ce_loss": 0.5961018204689026 }, { "epoch": 0.4073561400039549, "step": 4120, "train/sim_loss": 0.015625 }, { "epoch": 0.4073561400039549, "step": 4120, "train/total_loss": 0.07523518800735474 }, { "entropy": 8.91010570526123, "epoch": 0.4074550128534704, "mean_token_accuracy": 0.705636739730835, "num_tokens": 22653317.0, "step": 4121, "train/ce_loss": 0.7705829739570618 }, { "epoch": 0.4074550128534704, "step": 4121, "train/sim_loss": 0.04296875 }, { "epoch": 0.4074550128534704, "step": 4121, "train/total_loss": 0.12002705037593842 }, { "entropy": 9.295656204223633, "epoch": 0.40755388570298595, "mean_token_accuracy": 0.7823834419250488, "num_tokens": 22658565.0, "step": 4122, "train/ce_loss": 0.6719939708709717 }, { "epoch": 0.40755388570298595, "step": 4122, "train/sim_loss": 0.015625 }, { "epoch": 0.40755388570298595, "step": 4122, "train/total_loss": 0.08282440155744553 }, { "entropy": 8.935005187988281, "epoch": 0.4076527585525015, "mean_token_accuracy": 0.7345013618469238, "num_tokens": 22663938.0, "step": 4123, "train/ce_loss": 1.7523157596588135 }, { "epoch": 0.4076527585525015, "step": 4123, "train/sim_loss": 0.078125 }, { "epoch": 0.4076527585525015, "step": 4123, "train/total_loss": 0.25335657596588135 }, { "entropy": 9.214059829711914, "epoch": 0.407751631402017, "mean_token_accuracy": 0.7171717286109924, "num_tokens": 22669413.0, "step": 4124, "train/ce_loss": 0.4811275899410248 }, { "epoch": 0.407751631402017, "step": 4124, "train/sim_loss": 0.05078125 }, { "epoch": 0.407751631402017, "step": 4124, "train/total_loss": 0.09889401495456696 }, { "entropy": 8.803160667419434, "epoch": 0.4078505042515325, "mean_token_accuracy": 0.7416020631790161, "num_tokens": 22674783.0, "step": 4125, "train/ce_loss": 0.8443925380706787 }, { "epoch": 0.4078505042515325, "step": 4125, "train/sim_loss": 0.0546875 }, { "epoch": 0.4078505042515325, "step": 4125, "train/total_loss": 0.1391267478466034 }, { "entropy": 8.975228309631348, "epoch": 0.40794937710104806, "mean_token_accuracy": 0.7889022827148438, "num_tokens": 22680266.0, "step": 4126, "train/ce_loss": 0.8338801264762878 }, { "epoch": 0.40794937710104806, "step": 4126, "train/sim_loss": 0.0703125 }, { "epoch": 0.40794937710104806, "step": 4126, "train/total_loss": 0.15370051562786102 }, { "entropy": 8.974936485290527, "epoch": 0.4080482499505636, "mean_token_accuracy": 0.7830303311347961, "num_tokens": 22685717.0, "step": 4127, "train/ce_loss": 0.6454037427902222 }, { "epoch": 0.4080482499505636, "step": 4127, "train/sim_loss": 0.12109375 }, { "epoch": 0.4080482499505636, "step": 4127, "train/total_loss": 0.18563413619995117 }, { "entropy": 8.987045288085938, "epoch": 0.4081471228000791, "mean_token_accuracy": 0.7296416759490967, "num_tokens": 22691287.0, "step": 4128, "train/ce_loss": 0.8535547256469727 }, { "epoch": 0.4081471228000791, "step": 4128, "train/sim_loss": 0.07421875 }, { "epoch": 0.4081471228000791, "step": 4128, "train/total_loss": 0.1595742255449295 }, { "entropy": 9.21452522277832, "epoch": 0.40824599564959463, "mean_token_accuracy": 0.7942029237747192, "num_tokens": 22696520.0, "step": 4129, "train/ce_loss": 0.7001322507858276 }, { "epoch": 0.40824599564959463, "step": 4129, "train/sim_loss": 0.03515625 }, { "epoch": 0.40824599564959463, "step": 4129, "train/total_loss": 0.10516947507858276 }, { "entropy": 8.792745590209961, "epoch": 0.40834486849911017, "mean_token_accuracy": 0.7518072128295898, "num_tokens": 22701973.0, "step": 4130, "train/ce_loss": 0.8250725269317627 }, { "epoch": 0.40834486849911017, "step": 4130, "train/sim_loss": 0.08984375 }, { "epoch": 0.40834486849911017, "step": 4130, "train/total_loss": 0.17235100269317627 }, { "entropy": 8.994044303894043, "epoch": 0.40844374134862566, "mean_token_accuracy": 0.7592592835426331, "num_tokens": 22707608.0, "step": 4131, "train/ce_loss": 0.7759838104248047 }, { "epoch": 0.40844374134862566, "step": 4131, "train/sim_loss": 0.0703125 }, { "epoch": 0.40844374134862566, "step": 4131, "train/total_loss": 0.14791089296340942 }, { "entropy": 8.98423957824707, "epoch": 0.4085426141981412, "mean_token_accuracy": 0.7666231989860535, "num_tokens": 22712930.0, "step": 4132, "train/ce_loss": 0.8789015412330627 }, { "epoch": 0.4085426141981412, "step": 4132, "train/sim_loss": 0.04296875 }, { "epoch": 0.4085426141981412, "step": 4132, "train/total_loss": 0.1308588981628418 }, { "entropy": 8.739225387573242, "epoch": 0.40864148704765674, "mean_token_accuracy": 0.6954976320266724, "num_tokens": 22718392.0, "step": 4133, "train/ce_loss": 0.6550987362861633 }, { "epoch": 0.40864148704765674, "step": 4133, "train/sim_loss": 0.12109375 }, { "epoch": 0.40864148704765674, "step": 4133, "train/total_loss": 0.1866036355495453 }, { "entropy": 8.516159057617188, "epoch": 0.4087403598971722, "mean_token_accuracy": 0.6860759258270264, "num_tokens": 22724078.0, "step": 4134, "train/ce_loss": 0.3341137766838074 }, { "epoch": 0.4087403598971722, "step": 4134, "train/sim_loss": 0.03125 }, { "epoch": 0.4087403598971722, "step": 4134, "train/total_loss": 0.06466138362884521 }, { "entropy": 9.134689331054688, "epoch": 0.40883923274668776, "mean_token_accuracy": 0.7359198927879333, "num_tokens": 22729452.0, "step": 4135, "train/ce_loss": 0.583132803440094 }, { "epoch": 0.40883923274668776, "step": 4135, "train/sim_loss": 0.03125 }, { "epoch": 0.40883923274668776, "step": 4135, "train/total_loss": 0.0895632803440094 }, { "entropy": 8.514545440673828, "epoch": 0.4089381055962033, "mean_token_accuracy": 0.7384615540504456, "num_tokens": 22735273.0, "step": 4136, "train/ce_loss": 0.38694584369659424 }, { "epoch": 0.4089381055962033, "step": 4136, "train/sim_loss": 0.05859375 }, { "epoch": 0.4089381055962033, "step": 4136, "train/total_loss": 0.0972883403301239 }, { "entropy": 8.831079483032227, "epoch": 0.4090369784457188, "mean_token_accuracy": 0.6861538290977478, "num_tokens": 22740786.0, "step": 4137, "train/ce_loss": 1.0157281160354614 }, { "epoch": 0.4090369784457188, "step": 4137, "train/sim_loss": 0.0625 }, { "epoch": 0.4090369784457188, "step": 4137, "train/total_loss": 0.16407281160354614 }, { "entropy": 9.200315475463867, "epoch": 0.40913585129523433, "mean_token_accuracy": 0.7482993006706238, "num_tokens": 22746101.0, "step": 4138, "train/ce_loss": 1.3313082456588745 }, { "epoch": 0.40913585129523433, "step": 4138, "train/sim_loss": 0.05859375 }, { "epoch": 0.40913585129523433, "step": 4138, "train/total_loss": 0.19172458350658417 }, { "entropy": 8.980684280395508, "epoch": 0.4092347241447499, "mean_token_accuracy": 0.7827547788619995, "num_tokens": 22751634.0, "step": 4139, "train/ce_loss": 0.6188586950302124 }, { "epoch": 0.4092347241447499, "step": 4139, "train/sim_loss": 0.05078125 }, { "epoch": 0.4092347241447499, "step": 4139, "train/total_loss": 0.11266712099313736 }, { "epoch": 0.40933359699426536, "grad_norm": 0.648201584815979, "learning_rate": 8.979132670721456e-06, "loss": 0.1357, "step": 4140 }, { "entropy": 8.76017951965332, "epoch": 0.40933359699426536, "mean_token_accuracy": 0.7443298697471619, "num_tokens": 22757190.0, "step": 4140, "train/ce_loss": 1.199635624885559 }, { "epoch": 0.40933359699426536, "step": 4140, "train/sim_loss": 0.05078125 }, { "epoch": 0.40933359699426536, "step": 4140, "train/total_loss": 0.17074480652809143 }, { "entropy": 8.709144592285156, "epoch": 0.4094324698437809, "mean_token_accuracy": 0.7178947329521179, "num_tokens": 22762774.0, "step": 4141, "train/ce_loss": 1.009209394454956 }, { "epoch": 0.4094324698437809, "step": 4141, "train/sim_loss": 0.11328125 }, { "epoch": 0.4094324698437809, "step": 4141, "train/total_loss": 0.21420219540596008 }, { "entropy": 8.878829956054688, "epoch": 0.40953134269329644, "mean_token_accuracy": 0.7486910820007324, "num_tokens": 22768171.0, "step": 4142, "train/ce_loss": 0.9017958641052246 }, { "epoch": 0.40953134269329644, "step": 4142, "train/sim_loss": 0.06640625 }, { "epoch": 0.40953134269329644, "step": 4142, "train/total_loss": 0.15658584237098694 }, { "entropy": 8.762718200683594, "epoch": 0.4096302155428119, "mean_token_accuracy": 0.772951602935791, "num_tokens": 22773772.0, "step": 4143, "train/ce_loss": 0.4566960334777832 }, { "epoch": 0.4096302155428119, "step": 4143, "train/sim_loss": 0.0390625 }, { "epoch": 0.4096302155428119, "step": 4143, "train/total_loss": 0.08473210036754608 }, { "entropy": 8.545368194580078, "epoch": 0.40972908839232747, "mean_token_accuracy": 0.7429760694503784, "num_tokens": 22779389.0, "step": 4144, "train/ce_loss": 0.9505503177642822 }, { "epoch": 0.40972908839232747, "step": 4144, "train/sim_loss": 0.0625 }, { "epoch": 0.40972908839232747, "step": 4144, "train/total_loss": 0.15755504369735718 }, { "entropy": 9.183416366577148, "epoch": 0.409827961241843, "mean_token_accuracy": 0.7831325531005859, "num_tokens": 22784815.0, "step": 4145, "train/ce_loss": 0.5968858003616333 }, { "epoch": 0.409827961241843, "step": 4145, "train/sim_loss": 0.0625 }, { "epoch": 0.409827961241843, "step": 4145, "train/total_loss": 0.12218858301639557 }, { "entropy": 8.564065933227539, "epoch": 0.4099268340913585, "mean_token_accuracy": 0.7684310078620911, "num_tokens": 22790535.0, "step": 4146, "train/ce_loss": 0.3681793510913849 }, { "epoch": 0.4099268340913585, "step": 4146, "train/sim_loss": 0.0546875 }, { "epoch": 0.4099268340913585, "step": 4146, "train/total_loss": 0.09150543808937073 }, { "entropy": 9.078124046325684, "epoch": 0.41002570694087404, "mean_token_accuracy": 0.7636849284172058, "num_tokens": 22795847.0, "step": 4147, "train/ce_loss": 0.8584815263748169 }, { "epoch": 0.41002570694087404, "step": 4147, "train/sim_loss": 0.05859375 }, { "epoch": 0.41002570694087404, "step": 4147, "train/total_loss": 0.1444419026374817 }, { "entropy": 8.912630081176758, "epoch": 0.4101245797903896, "mean_token_accuracy": 0.7661470174789429, "num_tokens": 22801358.0, "step": 4148, "train/ce_loss": 0.7020477056503296 }, { "epoch": 0.4101245797903896, "step": 4148, "train/sim_loss": 0.05078125 }, { "epoch": 0.4101245797903896, "step": 4148, "train/total_loss": 0.12098602205514908 }, { "entropy": 8.54112434387207, "epoch": 0.41022345263990506, "mean_token_accuracy": 0.8174442052841187, "num_tokens": 22806985.0, "step": 4149, "train/ce_loss": 0.5083780884742737 }, { "epoch": 0.41022345263990506, "step": 4149, "train/sim_loss": 0.01953125 }, { "epoch": 0.41022345263990506, "step": 4149, "train/total_loss": 0.07036906480789185 }, { "entropy": 8.474512100219727, "epoch": 0.4103223254894206, "mean_token_accuracy": 0.7731225490570068, "num_tokens": 22812783.0, "step": 4150, "train/ce_loss": 1.5747147798538208 }, { "epoch": 0.4103223254894206, "step": 4150, "train/sim_loss": 0.0546875 }, { "epoch": 0.4103223254894206, "step": 4150, "train/total_loss": 0.21215897798538208 }, { "entropy": 9.214333534240723, "epoch": 0.41042119833893614, "mean_token_accuracy": 0.7094499468803406, "num_tokens": 22818121.0, "step": 4151, "train/ce_loss": 0.5467510223388672 }, { "epoch": 0.41042119833893614, "step": 4151, "train/sim_loss": 0.05859375 }, { "epoch": 0.41042119833893614, "step": 4151, "train/total_loss": 0.11326885223388672 }, { "entropy": 9.162769317626953, "epoch": 0.41052007118845163, "mean_token_accuracy": 0.7178527116775513, "num_tokens": 22823588.0, "step": 4152, "train/ce_loss": 1.0939208269119263 }, { "epoch": 0.41052007118845163, "step": 4152, "train/sim_loss": 0.09375 }, { "epoch": 0.41052007118845163, "step": 4152, "train/total_loss": 0.20314207673072815 }, { "entropy": 9.365636825561523, "epoch": 0.41061894403796717, "mean_token_accuracy": 0.7366254925727844, "num_tokens": 22828927.0, "step": 4153, "train/ce_loss": 0.6787307858467102 }, { "epoch": 0.41061894403796717, "step": 4153, "train/sim_loss": 0.109375 }, { "epoch": 0.41061894403796717, "step": 4153, "train/total_loss": 0.17724809050559998 }, { "entropy": 8.892210006713867, "epoch": 0.4107178168874827, "mean_token_accuracy": 0.7082872986793518, "num_tokens": 22834412.0, "step": 4154, "train/ce_loss": 0.7896271347999573 }, { "epoch": 0.4107178168874827, "step": 4154, "train/sim_loss": 0.0625 }, { "epoch": 0.4107178168874827, "step": 4154, "train/total_loss": 0.14146271347999573 }, { "entropy": 9.176536560058594, "epoch": 0.4108166897369982, "mean_token_accuracy": 0.791891872882843, "num_tokens": 22839759.0, "step": 4155, "train/ce_loss": 0.6063438057899475 }, { "epoch": 0.4108166897369982, "step": 4155, "train/sim_loss": 0.046875 }, { "epoch": 0.4108166897369982, "step": 4155, "train/total_loss": 0.10750938206911087 }, { "entropy": 8.94602108001709, "epoch": 0.41091556258651374, "mean_token_accuracy": 0.6874265670776367, "num_tokens": 22845186.0, "step": 4156, "train/ce_loss": 1.2373087406158447 }, { "epoch": 0.41091556258651374, "step": 4156, "train/sim_loss": 0.13671875 }, { "epoch": 0.41091556258651374, "step": 4156, "train/total_loss": 0.26044961810112 }, { "entropy": 8.250667572021484, "epoch": 0.4110144354360293, "mean_token_accuracy": 0.6746807098388672, "num_tokens": 22851144.0, "step": 4157, "train/ce_loss": 0.8668200969696045 }, { "epoch": 0.4110144354360293, "step": 4157, "train/sim_loss": 0.1015625 }, { "epoch": 0.4110144354360293, "step": 4157, "train/total_loss": 0.1882445216178894 }, { "entropy": 9.196889877319336, "epoch": 0.41111330828554477, "mean_token_accuracy": 0.7469553351402283, "num_tokens": 22856498.0, "step": 4158, "train/ce_loss": 0.6971787214279175 }, { "epoch": 0.41111330828554477, "step": 4158, "train/sim_loss": 0.05078125 }, { "epoch": 0.41111330828554477, "step": 4158, "train/total_loss": 0.1204991266131401 }, { "entropy": 9.034019470214844, "epoch": 0.4112121811350603, "mean_token_accuracy": 0.7892767786979675, "num_tokens": 22861967.0, "step": 4159, "train/ce_loss": 0.9186177849769592 }, { "epoch": 0.4112121811350603, "step": 4159, "train/sim_loss": 0.109375 }, { "epoch": 0.4112121811350603, "step": 4159, "train/total_loss": 0.2012367844581604 }, { "epoch": 0.41131105398457585, "grad_norm": 0.7836120128631592, "learning_rate": 8.974187805963509e-06, "loss": 0.1397, "step": 4160 }, { "entropy": 9.037981033325195, "epoch": 0.41131105398457585, "mean_token_accuracy": 0.7078279852867126, "num_tokens": 22867494.0, "step": 4160, "train/ce_loss": 1.2459080219268799 }, { "epoch": 0.41131105398457585, "step": 4160, "train/sim_loss": 0.0546875 }, { "epoch": 0.41131105398457585, "step": 4160, "train/total_loss": 0.17927831411361694 }, { "entropy": 8.93850326538086, "epoch": 0.41140992683409133, "mean_token_accuracy": 0.740276038646698, "num_tokens": 22872923.0, "step": 4161, "train/ce_loss": 0.7533289790153503 }, { "epoch": 0.41140992683409133, "step": 4161, "train/sim_loss": 0.06640625 }, { "epoch": 0.41140992683409133, "step": 4161, "train/total_loss": 0.141739159822464 }, { "entropy": 8.819135665893555, "epoch": 0.4115087996836069, "mean_token_accuracy": 0.7562296986579895, "num_tokens": 22878504.0, "step": 4162, "train/ce_loss": 0.5104396939277649 }, { "epoch": 0.4115087996836069, "step": 4162, "train/sim_loss": 0.0625 }, { "epoch": 0.4115087996836069, "step": 4162, "train/total_loss": 0.11354397237300873 }, { "entropy": 8.771127700805664, "epoch": 0.4116076725331224, "mean_token_accuracy": 0.727642297744751, "num_tokens": 22884082.0, "step": 4163, "train/ce_loss": 1.2084472179412842 }, { "epoch": 0.4116076725331224, "step": 4163, "train/sim_loss": 0.046875 }, { "epoch": 0.4116076725331224, "step": 4163, "train/total_loss": 0.16771972179412842 }, { "entropy": 9.324857711791992, "epoch": 0.4117065453826379, "mean_token_accuracy": 0.7613292932510376, "num_tokens": 22889374.0, "step": 4164, "train/ce_loss": 0.8078370094299316 }, { "epoch": 0.4117065453826379, "step": 4164, "train/sim_loss": 0.0703125 }, { "epoch": 0.4117065453826379, "step": 4164, "train/total_loss": 0.1510961949825287 }, { "entropy": 8.63064193725586, "epoch": 0.41180541823215344, "mean_token_accuracy": 0.7689873576164246, "num_tokens": 22894944.0, "step": 4165, "train/ce_loss": 0.83395916223526 }, { "epoch": 0.41180541823215344, "step": 4165, "train/sim_loss": 0.03125 }, { "epoch": 0.41180541823215344, "step": 4165, "train/total_loss": 0.11464592069387436 }, { "entropy": 9.049284934997559, "epoch": 0.411904291081669, "mean_token_accuracy": 0.7558860182762146, "num_tokens": 22900377.0, "step": 4166, "train/ce_loss": 0.5424871444702148 }, { "epoch": 0.411904291081669, "step": 4166, "train/sim_loss": 0.06640625 }, { "epoch": 0.411904291081669, "step": 4166, "train/total_loss": 0.12065497040748596 }, { "entropy": 8.693094253540039, "epoch": 0.4120031639311845, "mean_token_accuracy": 0.7062146663665771, "num_tokens": 22905953.0, "step": 4167, "train/ce_loss": 0.4301653206348419 }, { "epoch": 0.4120031639311845, "step": 4167, "train/sim_loss": 0.05078125 }, { "epoch": 0.4120031639311845, "step": 4167, "train/total_loss": 0.09379778802394867 }, { "entropy": 8.809514999389648, "epoch": 0.4121020367807, "mean_token_accuracy": 0.7422062158584595, "num_tokens": 22911383.0, "step": 4168, "train/ce_loss": 0.7450334429740906 }, { "epoch": 0.4121020367807, "step": 4168, "train/sim_loss": 0.046875 }, { "epoch": 0.4121020367807, "step": 4168, "train/total_loss": 0.1213783472776413 }, { "entropy": 8.856356620788574, "epoch": 0.41220090963021555, "mean_token_accuracy": 0.7246695756912231, "num_tokens": 22916946.0, "step": 4169, "train/ce_loss": 0.9119495153427124 }, { "epoch": 0.41220090963021555, "step": 4169, "train/sim_loss": 0.0390625 }, { "epoch": 0.41220090963021555, "step": 4169, "train/total_loss": 0.13025745749473572 }, { "entropy": 9.106440544128418, "epoch": 0.4122997824797311, "mean_token_accuracy": 0.7670682668685913, "num_tokens": 22922367.0, "step": 4170, "train/ce_loss": 0.5735121369361877 }, { "epoch": 0.4122997824797311, "step": 4170, "train/sim_loss": 0.02734375 }, { "epoch": 0.4122997824797311, "step": 4170, "train/total_loss": 0.08469496667385101 }, { "entropy": 8.805926322937012, "epoch": 0.4123986553292466, "mean_token_accuracy": 0.7221006751060486, "num_tokens": 22927933.0, "step": 4171, "train/ce_loss": 1.3707829713821411 }, { "epoch": 0.4123986553292466, "step": 4171, "train/sim_loss": 0.09375 }, { "epoch": 0.4123986553292466, "step": 4171, "train/total_loss": 0.23082830011844635 }, { "entropy": 8.759446144104004, "epoch": 0.4124975281787621, "mean_token_accuracy": 0.800000011920929, "num_tokens": 22933545.0, "step": 4172, "train/ce_loss": 0.8408647179603577 }, { "epoch": 0.4124975281787621, "step": 4172, "train/sim_loss": 0.06640625 }, { "epoch": 0.4124975281787621, "step": 4172, "train/total_loss": 0.15049272775650024 }, { "entropy": 8.932116508483887, "epoch": 0.41259640102827766, "mean_token_accuracy": 0.7223340272903442, "num_tokens": 22939044.0, "step": 4173, "train/ce_loss": 0.46019256114959717 }, { "epoch": 0.41259640102827766, "step": 4173, "train/sim_loss": 0.0234375 }, { "epoch": 0.41259640102827766, "step": 4173, "train/total_loss": 0.06945675611495972 }, { "entropy": 8.290313720703125, "epoch": 0.41269527387779315, "mean_token_accuracy": 0.7035316228866577, "num_tokens": 22944654.0, "step": 4174, "train/ce_loss": 0.8584254384040833 }, { "epoch": 0.41269527387779315, "step": 4174, "train/sim_loss": 0.03515625 }, { "epoch": 0.41269527387779315, "step": 4174, "train/total_loss": 0.1209987923502922 }, { "entropy": 9.114212036132812, "epoch": 0.4127941467273087, "mean_token_accuracy": 0.7262773513793945, "num_tokens": 22950271.0, "step": 4175, "train/ce_loss": 0.6852434873580933 }, { "epoch": 0.4127941467273087, "step": 4175, "train/sim_loss": 0.0546875 }, { "epoch": 0.4127941467273087, "step": 4175, "train/total_loss": 0.12321185320615768 }, { "entropy": 9.115262985229492, "epoch": 0.4128930195768242, "mean_token_accuracy": 0.7453581094741821, "num_tokens": 22955633.0, "step": 4176, "train/ce_loss": 0.735322117805481 }, { "epoch": 0.4128930195768242, "step": 4176, "train/sim_loss": 0.08203125 }, { "epoch": 0.4128930195768242, "step": 4176, "train/total_loss": 0.15556347370147705 }, { "entropy": 9.032724380493164, "epoch": 0.4129918924263397, "mean_token_accuracy": 0.7967581152915955, "num_tokens": 22961079.0, "step": 4177, "train/ce_loss": 0.9530428647994995 }, { "epoch": 0.4129918924263397, "step": 4177, "train/sim_loss": 0.046875 }, { "epoch": 0.4129918924263397, "step": 4177, "train/total_loss": 0.14217928051948547 }, { "entropy": 8.924708366394043, "epoch": 0.41309076527585525, "mean_token_accuracy": 0.7985074520111084, "num_tokens": 22966454.0, "step": 4178, "train/ce_loss": 0.5434448719024658 }, { "epoch": 0.41309076527585525, "step": 4178, "train/sim_loss": 0.08984375 }, { "epoch": 0.41309076527585525, "step": 4178, "train/total_loss": 0.14418824017047882 }, { "entropy": 9.047996520996094, "epoch": 0.4131896381253708, "mean_token_accuracy": 0.7061790823936462, "num_tokens": 22971903.0, "step": 4179, "train/ce_loss": 1.1025748252868652 }, { "epoch": 0.4131896381253708, "step": 4179, "train/sim_loss": 0.07421875 }, { "epoch": 0.4131896381253708, "step": 4179, "train/total_loss": 0.18447622656822205 }, { "epoch": 0.4132885109748863, "grad_norm": 0.8070016503334045, "learning_rate": 8.969242941205559e-06, "loss": 0.1381, "step": 4180 }, { "entropy": 8.688820838928223, "epoch": 0.4132885109748863, "mean_token_accuracy": 0.6872385144233704, "num_tokens": 22977491.0, "step": 4180, "train/ce_loss": 1.1605677604675293 }, { "epoch": 0.4132885109748863, "step": 4180, "train/sim_loss": 0.0546875 }, { "epoch": 0.4132885109748863, "step": 4180, "train/total_loss": 0.17074427008628845 }, { "entropy": 8.878438949584961, "epoch": 0.4133873838244018, "mean_token_accuracy": 0.772009015083313, "num_tokens": 22983005.0, "step": 4181, "train/ce_loss": 0.47543230652809143 }, { "epoch": 0.4133873838244018, "step": 4181, "train/sim_loss": 0.02734375 }, { "epoch": 0.4133873838244018, "step": 4181, "train/total_loss": 0.0748869776725769 }, { "entropy": 9.071276664733887, "epoch": 0.41348625667391736, "mean_token_accuracy": 0.7204433679580688, "num_tokens": 22988393.0, "step": 4182, "train/ce_loss": 0.48252326250076294 }, { "epoch": 0.41348625667391736, "step": 4182, "train/sim_loss": 0.02734375 }, { "epoch": 0.41348625667391736, "step": 4182, "train/total_loss": 0.07559607923030853 }, { "entropy": 9.099968910217285, "epoch": 0.41358512952343285, "mean_token_accuracy": 0.7438119053840637, "num_tokens": 22993777.0, "step": 4183, "train/ce_loss": 0.6840600967407227 }, { "epoch": 0.41358512952343285, "step": 4183, "train/sim_loss": 0.0234375 }, { "epoch": 0.41358512952343285, "step": 4183, "train/total_loss": 0.09184350818395615 }, { "entropy": 8.829450607299805, "epoch": 0.4136840023729484, "mean_token_accuracy": 0.7361751198768616, "num_tokens": 22999279.0, "step": 4184, "train/ce_loss": 0.7415414452552795 }, { "epoch": 0.4136840023729484, "step": 4184, "train/sim_loss": 0.0625 }, { "epoch": 0.4136840023729484, "step": 4184, "train/total_loss": 0.13665413856506348 }, { "entropy": 8.596948623657227, "epoch": 0.41378287522246393, "mean_token_accuracy": 0.6824925541877747, "num_tokens": 23004842.0, "step": 4185, "train/ce_loss": 2.1103968620300293 }, { "epoch": 0.41378287522246393, "step": 4185, "train/sim_loss": 0.0703125 }, { "epoch": 0.41378287522246393, "step": 4185, "train/total_loss": 0.2813521921634674 }, { "entropy": 9.22066879272461, "epoch": 0.4138817480719794, "mean_token_accuracy": 0.7266387939453125, "num_tokens": 23010182.0, "step": 4186, "train/ce_loss": 0.5803707838058472 }, { "epoch": 0.4138817480719794, "step": 4186, "train/sim_loss": 0.078125 }, { "epoch": 0.4138817480719794, "step": 4186, "train/total_loss": 0.13616207242012024 }, { "entropy": 9.034416198730469, "epoch": 0.41398062092149496, "mean_token_accuracy": 0.7035670280456543, "num_tokens": 23015671.0, "step": 4187, "train/ce_loss": 1.157197117805481 }, { "epoch": 0.41398062092149496, "step": 4187, "train/sim_loss": 0.0234375 }, { "epoch": 0.41398062092149496, "step": 4187, "train/total_loss": 0.13915720582008362 }, { "entropy": 8.874421119689941, "epoch": 0.4140794937710105, "mean_token_accuracy": 0.7887789011001587, "num_tokens": 23021250.0, "step": 4188, "train/ce_loss": 0.5605764389038086 }, { "epoch": 0.4140794937710105, "step": 4188, "train/sim_loss": 0.0625 }, { "epoch": 0.4140794937710105, "step": 4188, "train/total_loss": 0.1185576468706131 }, { "entropy": 8.830521583557129, "epoch": 0.414178366620526, "mean_token_accuracy": 0.7254464030265808, "num_tokens": 23026739.0, "step": 4189, "train/ce_loss": 0.562842607498169 }, { "epoch": 0.414178366620526, "step": 4189, "train/sim_loss": 0.0546875 }, { "epoch": 0.414178366620526, "step": 4189, "train/total_loss": 0.11097176373004913 }, { "entropy": 9.023571014404297, "epoch": 0.4142772394700415, "mean_token_accuracy": 0.7732620239257812, "num_tokens": 23032505.0, "step": 4190, "train/ce_loss": 0.46833616495132446 }, { "epoch": 0.4142772394700415, "step": 4190, "train/sim_loss": 0.078125 }, { "epoch": 0.4142772394700415, "step": 4190, "train/total_loss": 0.12495861947536469 }, { "entropy": 8.746211051940918, "epoch": 0.41437611231955707, "mean_token_accuracy": 0.7297896146774292, "num_tokens": 23038027.0, "step": 4191, "train/ce_loss": 0.8160730004310608 }, { "epoch": 0.41437611231955707, "step": 4191, "train/sim_loss": 0.0703125 }, { "epoch": 0.41437611231955707, "step": 4191, "train/total_loss": 0.15191981196403503 }, { "entropy": 8.998167037963867, "epoch": 0.41447498516907255, "mean_token_accuracy": 0.7768595218658447, "num_tokens": 23043466.0, "step": 4192, "train/ce_loss": 0.5354365706443787 }, { "epoch": 0.41447498516907255, "step": 4192, "train/sim_loss": 0.09375 }, { "epoch": 0.41447498516907255, "step": 4192, "train/total_loss": 0.14729365706443787 }, { "entropy": 9.385431289672852, "epoch": 0.4145738580185881, "mean_token_accuracy": 0.7267355918884277, "num_tokens": 23048803.0, "step": 4193, "train/ce_loss": 0.5801143050193787 }, { "epoch": 0.4145738580185881, "step": 4193, "train/sim_loss": 0.15234375 }, { "epoch": 0.4145738580185881, "step": 4193, "train/total_loss": 0.21035517752170563 }, { "entropy": 9.077079772949219, "epoch": 0.41467273086810363, "mean_token_accuracy": 0.75, "num_tokens": 23054185.0, "step": 4194, "train/ce_loss": 1.447477102279663 }, { "epoch": 0.41467273086810363, "step": 4194, "train/sim_loss": 0.12890625 }, { "epoch": 0.41467273086810363, "step": 4194, "train/total_loss": 0.2736539840698242 }, { "entropy": 8.735644340515137, "epoch": 0.4147716037176191, "mean_token_accuracy": 0.7565789222717285, "num_tokens": 23059727.0, "step": 4195, "train/ce_loss": 0.5633814334869385 }, { "epoch": 0.4147716037176191, "step": 4195, "train/sim_loss": 0.0234375 }, { "epoch": 0.4147716037176191, "step": 4195, "train/total_loss": 0.07977564632892609 }, { "entropy": 8.62299919128418, "epoch": 0.41487047656713466, "mean_token_accuracy": 0.7173174619674683, "num_tokens": 23065458.0, "step": 4196, "train/ce_loss": 0.5042151212692261 }, { "epoch": 0.41487047656713466, "step": 4196, "train/sim_loss": 0.05859375 }, { "epoch": 0.41487047656713466, "step": 4196, "train/total_loss": 0.10901526361703873 }, { "entropy": 8.753923416137695, "epoch": 0.4149693494166502, "mean_token_accuracy": 0.7338709831237793, "num_tokens": 23071132.0, "step": 4197, "train/ce_loss": 1.0221500396728516 }, { "epoch": 0.4149693494166502, "step": 4197, "train/sim_loss": 0.04296875 }, { "epoch": 0.4149693494166502, "step": 4197, "train/total_loss": 0.1451837569475174 }, { "entropy": 9.21451187133789, "epoch": 0.4150682222661657, "mean_token_accuracy": 0.7978141903877258, "num_tokens": 23076480.0, "step": 4198, "train/ce_loss": 0.6094051599502563 }, { "epoch": 0.4150682222661657, "step": 4198, "train/sim_loss": 0.0234375 }, { "epoch": 0.4150682222661657, "step": 4198, "train/total_loss": 0.08437801897525787 }, { "entropy": 8.68425178527832, "epoch": 0.41516709511568123, "mean_token_accuracy": 0.6807935237884521, "num_tokens": 23082152.0, "step": 4199, "train/ce_loss": 1.9778947830200195 }, { "epoch": 0.41516709511568123, "step": 4199, "train/sim_loss": 0.0625 }, { "epoch": 0.41516709511568123, "step": 4199, "train/total_loss": 0.2602894902229309 }, { "epoch": 0.41526596796519677, "grad_norm": 0.8068773150444031, "learning_rate": 8.96429807644761e-06, "loss": 0.1452, "step": 4200 }, { "entropy": 8.407499313354492, "epoch": 0.41526596796519677, "mean_token_accuracy": 0.7395715713500977, "num_tokens": 23087668.0, "step": 4200, "train/ce_loss": 1.0081496238708496 }, { "epoch": 0.41526596796519677, "step": 4200, "train/sim_loss": 0.078125 }, { "epoch": 0.41526596796519677, "step": 4200, "train/total_loss": 0.17893996834754944 }, { "entropy": 8.941858291625977, "epoch": 0.41536484081471226, "mean_token_accuracy": 0.7220843434333801, "num_tokens": 23093087.0, "step": 4201, "train/ce_loss": 0.8946034908294678 }, { "epoch": 0.41536484081471226, "step": 4201, "train/sim_loss": 0.0390625 }, { "epoch": 0.41536484081471226, "step": 4201, "train/total_loss": 0.1285228431224823 }, { "entropy": 9.623502731323242, "epoch": 0.4154637136642278, "mean_token_accuracy": 0.796875, "num_tokens": 23098337.0, "step": 4202, "train/ce_loss": 0.621929943561554 }, { "epoch": 0.4154637136642278, "step": 4202, "train/sim_loss": 0.109375 }, { "epoch": 0.4154637136642278, "step": 4202, "train/total_loss": 0.17156799137592316 }, { "entropy": 8.799476623535156, "epoch": 0.41556258651374334, "mean_token_accuracy": 0.7425860166549683, "num_tokens": 23103803.0, "step": 4203, "train/ce_loss": 1.0786923170089722 }, { "epoch": 0.41556258651374334, "step": 4203, "train/sim_loss": 0.03515625 }, { "epoch": 0.41556258651374334, "step": 4203, "train/total_loss": 0.1430254876613617 }, { "entropy": 9.200511932373047, "epoch": 0.4156614593632588, "mean_token_accuracy": 0.6982248425483704, "num_tokens": 23109042.0, "step": 4204, "train/ce_loss": 1.1861149072647095 }, { "epoch": 0.4156614593632588, "step": 4204, "train/sim_loss": 0.109375 }, { "epoch": 0.4156614593632588, "step": 4204, "train/total_loss": 0.22798648476600647 }, { "entropy": 8.956999778747559, "epoch": 0.41576033221277436, "mean_token_accuracy": 0.7197231650352478, "num_tokens": 23114537.0, "step": 4205, "train/ce_loss": 0.6161093711853027 }, { "epoch": 0.41576033221277436, "step": 4205, "train/sim_loss": 0.0390625 }, { "epoch": 0.41576033221277436, "step": 4205, "train/total_loss": 0.10067343711853027 }, { "entropy": 9.106650352478027, "epoch": 0.4158592050622899, "mean_token_accuracy": 0.7846332788467407, "num_tokens": 23119982.0, "step": 4206, "train/ce_loss": 0.6685699820518494 }, { "epoch": 0.4158592050622899, "step": 4206, "train/sim_loss": 0.05078125 }, { "epoch": 0.4158592050622899, "step": 4206, "train/total_loss": 0.1176382526755333 }, { "entropy": 8.773327827453613, "epoch": 0.4159580779118054, "mean_token_accuracy": 0.7105831503868103, "num_tokens": 23125517.0, "step": 4207, "train/ce_loss": 0.8112591505050659 }, { "epoch": 0.4159580779118054, "step": 4207, "train/sim_loss": 0.08984375 }, { "epoch": 0.4159580779118054, "step": 4207, "train/total_loss": 0.1709696650505066 }, { "entropy": 8.817028045654297, "epoch": 0.41605695076132093, "mean_token_accuracy": 0.7162629961967468, "num_tokens": 23131015.0, "step": 4208, "train/ce_loss": 0.7026969790458679 }, { "epoch": 0.41605695076132093, "step": 4208, "train/sim_loss": 0.16015625 }, { "epoch": 0.41605695076132093, "step": 4208, "train/total_loss": 0.23042595386505127 }, { "entropy": 8.738761901855469, "epoch": 0.4161558236108365, "mean_token_accuracy": 0.7557557821273804, "num_tokens": 23136635.0, "step": 4209, "train/ce_loss": 0.40469929575920105 }, { "epoch": 0.4161558236108365, "step": 4209, "train/sim_loss": 0.05078125 }, { "epoch": 0.4161558236108365, "step": 4209, "train/total_loss": 0.0912511795759201 }, { "entropy": 8.749971389770508, "epoch": 0.416254696460352, "mean_token_accuracy": 0.7175379395484924, "num_tokens": 23142431.0, "step": 4210, "train/ce_loss": 0.7819333076477051 }, { "epoch": 0.416254696460352, "step": 4210, "train/sim_loss": 0.09375 }, { "epoch": 0.416254696460352, "step": 4210, "train/total_loss": 0.17194333672523499 }, { "entropy": 8.744904518127441, "epoch": 0.4163535693098675, "mean_token_accuracy": 0.7165005803108215, "num_tokens": 23147991.0, "step": 4211, "train/ce_loss": 0.7356314659118652 }, { "epoch": 0.4163535693098675, "step": 4211, "train/sim_loss": 0.046875 }, { "epoch": 0.4163535693098675, "step": 4211, "train/total_loss": 0.12043815106153488 }, { "entropy": 8.895011901855469, "epoch": 0.41645244215938304, "mean_token_accuracy": 0.7491821050643921, "num_tokens": 23153409.0, "step": 4212, "train/ce_loss": 0.9463136792182922 }, { "epoch": 0.41645244215938304, "step": 4212, "train/sim_loss": 0.09375 }, { "epoch": 0.41645244215938304, "step": 4212, "train/total_loss": 0.1883813738822937 }, { "entropy": 9.123252868652344, "epoch": 0.4165513150088986, "mean_token_accuracy": 0.7997416257858276, "num_tokens": 23158824.0, "step": 4213, "train/ce_loss": 0.47096017003059387 }, { "epoch": 0.4165513150088986, "step": 4213, "train/sim_loss": 0.0234375 }, { "epoch": 0.4165513150088986, "step": 4213, "train/total_loss": 0.07053351402282715 }, { "entropy": 8.882776260375977, "epoch": 0.41665018785841407, "mean_token_accuracy": 0.7116374969482422, "num_tokens": 23164445.0, "step": 4214, "train/ce_loss": 0.8234058618545532 }, { "epoch": 0.41665018785841407, "step": 4214, "train/sim_loss": 0.09765625 }, { "epoch": 0.41665018785841407, "step": 4214, "train/total_loss": 0.17999684810638428 }, { "entropy": 9.210935592651367, "epoch": 0.4167490607079296, "mean_token_accuracy": 0.7121001482009888, "num_tokens": 23169776.0, "step": 4215, "train/ce_loss": 0.4492507874965668 }, { "epoch": 0.4167490607079296, "step": 4215, "train/sim_loss": 0.0625 }, { "epoch": 0.4167490607079296, "step": 4215, "train/total_loss": 0.10742507874965668 }, { "entropy": 8.903751373291016, "epoch": 0.41684793355744515, "mean_token_accuracy": 0.7115578055381775, "num_tokens": 23175399.0, "step": 4216, "train/ce_loss": 1.1486306190490723 }, { "epoch": 0.41684793355744515, "step": 4216, "train/sim_loss": 0.0703125 }, { "epoch": 0.41684793355744515, "step": 4216, "train/total_loss": 0.1851755678653717 }, { "entropy": 8.45914077758789, "epoch": 0.41694680640696064, "mean_token_accuracy": 0.7866344451904297, "num_tokens": 23181216.0, "step": 4217, "train/ce_loss": 0.35081347823143005 }, { "epoch": 0.41694680640696064, "step": 4217, "train/sim_loss": 0.02734375 }, { "epoch": 0.41694680640696064, "step": 4217, "train/total_loss": 0.062425099313259125 }, { "entropy": 8.864446640014648, "epoch": 0.4170456792564762, "mean_token_accuracy": 0.7581920623779297, "num_tokens": 23186749.0, "step": 4218, "train/ce_loss": 0.874320387840271 }, { "epoch": 0.4170456792564762, "step": 4218, "train/sim_loss": 0.05078125 }, { "epoch": 0.4170456792564762, "step": 4218, "train/total_loss": 0.13821329176425934 }, { "entropy": 8.959444046020508, "epoch": 0.4171445521059917, "mean_token_accuracy": 0.6905472874641418, "num_tokens": 23192295.0, "step": 4219, "train/ce_loss": 1.4770430326461792 }, { "epoch": 0.4171445521059917, "step": 4219, "train/sim_loss": 0.046875 }, { "epoch": 0.4171445521059917, "step": 4219, "train/total_loss": 0.19457930326461792 }, { "epoch": 0.4172434249555072, "grad_norm": 0.7804780006408691, "learning_rate": 8.95935321168966e-06, "loss": 0.1489, "step": 4220 }, { "entropy": 8.875007629394531, "epoch": 0.4172434249555072, "mean_token_accuracy": 0.7668638825416565, "num_tokens": 23197786.0, "step": 4220, "train/ce_loss": 0.830984890460968 }, { "epoch": 0.4172434249555072, "step": 4220, "train/sim_loss": 0.0234375 }, { "epoch": 0.4172434249555072, "step": 4220, "train/total_loss": 0.10653599351644516 }, { "entropy": 8.917108535766602, "epoch": 0.41734229780502274, "mean_token_accuracy": 0.7617865800857544, "num_tokens": 23203160.0, "step": 4221, "train/ce_loss": 0.7978419661521912 }, { "epoch": 0.41734229780502274, "step": 4221, "train/sim_loss": 0.0625 }, { "epoch": 0.41734229780502274, "step": 4221, "train/total_loss": 0.14228419959545135 }, { "entropy": 9.074207305908203, "epoch": 0.4174411706545383, "mean_token_accuracy": 0.7453917264938354, "num_tokens": 23208593.0, "step": 4222, "train/ce_loss": 0.7045638561248779 }, { "epoch": 0.4174411706545383, "step": 4222, "train/sim_loss": 0.04296875 }, { "epoch": 0.4174411706545383, "step": 4222, "train/total_loss": 0.11342513561248779 }, { "entropy": 8.763494491577148, "epoch": 0.41754004350405377, "mean_token_accuracy": 0.7488738894462585, "num_tokens": 23214089.0, "step": 4223, "train/ce_loss": 0.6462128162384033 }, { "epoch": 0.41754004350405377, "step": 4223, "train/sim_loss": 0.0390625 }, { "epoch": 0.41754004350405377, "step": 4223, "train/total_loss": 0.10368378460407257 }, { "entropy": 8.692676544189453, "epoch": 0.4176389163535693, "mean_token_accuracy": 0.7683049440383911, "num_tokens": 23219715.0, "step": 4224, "train/ce_loss": 0.5085378885269165 }, { "epoch": 0.4176389163535693, "step": 4224, "train/sim_loss": 0.03125 }, { "epoch": 0.4176389163535693, "step": 4224, "train/total_loss": 0.08210378885269165 }, { "entropy": 9.347648620605469, "epoch": 0.41773778920308485, "mean_token_accuracy": 0.7678321599960327, "num_tokens": 23225006.0, "step": 4225, "train/ce_loss": 0.49214184284210205 }, { "epoch": 0.41773778920308485, "step": 4225, "train/sim_loss": 0.0546875 }, { "epoch": 0.41773778920308485, "step": 4225, "train/total_loss": 0.1039016842842102 }, { "entropy": 8.790496826171875, "epoch": 0.41783666205260034, "mean_token_accuracy": 0.7182390093803406, "num_tokens": 23230451.0, "step": 4226, "train/ce_loss": 1.1140530109405518 }, { "epoch": 0.41783666205260034, "step": 4226, "train/sim_loss": 0.06640625 }, { "epoch": 0.41783666205260034, "step": 4226, "train/total_loss": 0.17781156301498413 }, { "entropy": 8.872556686401367, "epoch": 0.4179355349021159, "mean_token_accuracy": 0.7212499976158142, "num_tokens": 23235895.0, "step": 4227, "train/ce_loss": 0.8941662311553955 }, { "epoch": 0.4179355349021159, "step": 4227, "train/sim_loss": 0.0390625 }, { "epoch": 0.4179355349021159, "step": 4227, "train/total_loss": 0.12847912311553955 }, { "entropy": 9.353656768798828, "epoch": 0.4180344077516314, "mean_token_accuracy": 0.7390745282173157, "num_tokens": 23241201.0, "step": 4228, "train/ce_loss": 0.741772472858429 }, { "epoch": 0.4180344077516314, "step": 4228, "train/sim_loss": 0.0625 }, { "epoch": 0.4180344077516314, "step": 4228, "train/total_loss": 0.13667725026607513 }, { "entropy": 8.580613136291504, "epoch": 0.4181332806011469, "mean_token_accuracy": 0.7488095164299011, "num_tokens": 23246589.0, "step": 4229, "train/ce_loss": 1.2272883653640747 }, { "epoch": 0.4181332806011469, "step": 4229, "train/sim_loss": 0.08984375 }, { "epoch": 0.4181332806011469, "step": 4229, "train/total_loss": 0.2125725895166397 }, { "entropy": 8.76191520690918, "epoch": 0.41823215345066245, "mean_token_accuracy": 0.7296416759490967, "num_tokens": 23252081.0, "step": 4230, "train/ce_loss": 0.8529883623123169 }, { "epoch": 0.41823215345066245, "step": 4230, "train/sim_loss": 0.02734375 }, { "epoch": 0.41823215345066245, "step": 4230, "train/total_loss": 0.11264258623123169 }, { "entropy": 9.045295715332031, "epoch": 0.418331026300178, "mean_token_accuracy": 0.7622298002243042, "num_tokens": 23257514.0, "step": 4231, "train/ce_loss": 0.8010681867599487 }, { "epoch": 0.418331026300178, "step": 4231, "train/sim_loss": 0.125 }, { "epoch": 0.418331026300178, "step": 4231, "train/total_loss": 0.20510682463645935 }, { "entropy": 9.286943435668945, "epoch": 0.4184298991496935, "mean_token_accuracy": 0.7419354915618896, "num_tokens": 23262794.0, "step": 4232, "train/ce_loss": 0.620381772518158 }, { "epoch": 0.4184298991496935, "step": 4232, "train/sim_loss": 0.1171875 }, { "epoch": 0.4184298991496935, "step": 4232, "train/total_loss": 0.17922568321228027 }, { "entropy": 8.972993850708008, "epoch": 0.418528771999209, "mean_token_accuracy": 0.7485241889953613, "num_tokens": 23268233.0, "step": 4233, "train/ce_loss": 0.6345102190971375 }, { "epoch": 0.418528771999209, "step": 4233, "train/sim_loss": 0.03515625 }, { "epoch": 0.418528771999209, "step": 4233, "train/total_loss": 0.09860727190971375 }, { "entropy": 8.957637786865234, "epoch": 0.41862764484872456, "mean_token_accuracy": 0.8171296119689941, "num_tokens": 23273684.0, "step": 4234, "train/ce_loss": 0.4282701313495636 }, { "epoch": 0.41862764484872456, "step": 4234, "train/sim_loss": 0.04296875 }, { "epoch": 0.41862764484872456, "step": 4234, "train/total_loss": 0.08579576015472412 }, { "entropy": 8.825374603271484, "epoch": 0.41872651769824004, "mean_token_accuracy": 0.7544987201690674, "num_tokens": 23279071.0, "step": 4235, "train/ce_loss": 0.43500611186027527 }, { "epoch": 0.41872651769824004, "step": 4235, "train/sim_loss": 0.03125 }, { "epoch": 0.41872651769824004, "step": 4235, "train/total_loss": 0.074750617146492 }, { "entropy": 8.847186088562012, "epoch": 0.4188253905477556, "mean_token_accuracy": 0.7118450999259949, "num_tokens": 23284591.0, "step": 4236, "train/ce_loss": 1.4223542213439941 }, { "epoch": 0.4188253905477556, "step": 4236, "train/sim_loss": 0.08203125 }, { "epoch": 0.4188253905477556, "step": 4236, "train/total_loss": 0.2242666780948639 }, { "entropy": 8.583212852478027, "epoch": 0.4189242633972711, "mean_token_accuracy": 0.7435367107391357, "num_tokens": 23290191.0, "step": 4237, "train/ce_loss": 0.5025284290313721 }, { "epoch": 0.4189242633972711, "step": 4237, "train/sim_loss": 0.03125 }, { "epoch": 0.4189242633972711, "step": 4237, "train/total_loss": 0.08150283992290497 }, { "entropy": 8.940250396728516, "epoch": 0.4190231362467866, "mean_token_accuracy": 0.7228327393531799, "num_tokens": 23295586.0, "step": 4238, "train/ce_loss": 1.2805254459381104 }, { "epoch": 0.4190231362467866, "step": 4238, "train/sim_loss": 0.0703125 }, { "epoch": 0.4190231362467866, "step": 4238, "train/total_loss": 0.19836504757404327 }, { "entropy": 8.879223823547363, "epoch": 0.41912200909630215, "mean_token_accuracy": 0.7547649145126343, "num_tokens": 23301037.0, "step": 4239, "train/ce_loss": 0.8774985074996948 }, { "epoch": 0.41912200909630215, "step": 4239, "train/sim_loss": 0.08984375 }, { "epoch": 0.41912200909630215, "step": 4239, "train/total_loss": 0.17759360373020172 }, { "epoch": 0.4192208819458177, "grad_norm": 0.8741592764854431, "learning_rate": 8.954408346931712e-06, "loss": 0.1378, "step": 4240 }, { "entropy": 8.93989372253418, "epoch": 0.4192208819458177, "mean_token_accuracy": 0.7620689868927002, "num_tokens": 23306419.0, "step": 4240, "train/ce_loss": 1.1524932384490967 }, { "epoch": 0.4192208819458177, "step": 4240, "train/sim_loss": 0.05859375 }, { "epoch": 0.4192208819458177, "step": 4240, "train/total_loss": 0.17384308576583862 }, { "entropy": 8.588336944580078, "epoch": 0.4193197547953332, "mean_token_accuracy": 0.7358997464179993, "num_tokens": 23312077.0, "step": 4241, "train/ce_loss": 0.5935593843460083 }, { "epoch": 0.4193197547953332, "step": 4241, "train/sim_loss": 0.11328125 }, { "epoch": 0.4193197547953332, "step": 4241, "train/total_loss": 0.1726371943950653 }, { "entropy": 9.04145622253418, "epoch": 0.4194186276448487, "mean_token_accuracy": 0.6867470145225525, "num_tokens": 23317460.0, "step": 4242, "train/ce_loss": 1.037589430809021 }, { "epoch": 0.4194186276448487, "step": 4242, "train/sim_loss": 0.08984375 }, { "epoch": 0.4194186276448487, "step": 4242, "train/total_loss": 0.19360269606113434 }, { "entropy": 8.870402336120605, "epoch": 0.41951750049436426, "mean_token_accuracy": 0.7776298522949219, "num_tokens": 23322800.0, "step": 4243, "train/ce_loss": 0.546425998210907 }, { "epoch": 0.41951750049436426, "step": 4243, "train/sim_loss": 0.0625 }, { "epoch": 0.41951750049436426, "step": 4243, "train/total_loss": 0.11714260280132294 }, { "entropy": 8.772353172302246, "epoch": 0.41961637334387974, "mean_token_accuracy": 0.7482014298439026, "num_tokens": 23328244.0, "step": 4244, "train/ce_loss": 0.696922779083252 }, { "epoch": 0.41961637334387974, "step": 4244, "train/sim_loss": 0.1015625 }, { "epoch": 0.41961637334387974, "step": 4244, "train/total_loss": 0.17125478386878967 }, { "entropy": 8.754507064819336, "epoch": 0.4197152461933953, "mean_token_accuracy": 0.7648376226425171, "num_tokens": 23333713.0, "step": 4245, "train/ce_loss": 0.9002295136451721 }, { "epoch": 0.4197152461933953, "step": 4245, "train/sim_loss": 0.04296875 }, { "epoch": 0.4197152461933953, "step": 4245, "train/total_loss": 0.1329917013645172 }, { "entropy": 8.766524314880371, "epoch": 0.4198141190429108, "mean_token_accuracy": 0.7592997550964355, "num_tokens": 23339228.0, "step": 4246, "train/ce_loss": 0.8590804934501648 }, { "epoch": 0.4198141190429108, "step": 4246, "train/sim_loss": 0.0625 }, { "epoch": 0.4198141190429108, "step": 4246, "train/total_loss": 0.14840805530548096 }, { "entropy": 8.83462142944336, "epoch": 0.4199129918924263, "mean_token_accuracy": 0.6970760226249695, "num_tokens": 23344724.0, "step": 4247, "train/ce_loss": 0.5798290371894836 }, { "epoch": 0.4199129918924263, "step": 4247, "train/sim_loss": 0.046875 }, { "epoch": 0.4199129918924263, "step": 4247, "train/total_loss": 0.1048579066991806 }, { "entropy": 8.50704574584961, "epoch": 0.42001186474194185, "mean_token_accuracy": 0.8105065822601318, "num_tokens": 23350356.0, "step": 4248, "train/ce_loss": 0.4264954924583435 }, { "epoch": 0.42001186474194185, "step": 4248, "train/sim_loss": 0.01953125 }, { "epoch": 0.42001186474194185, "step": 4248, "train/total_loss": 0.06218079850077629 }, { "entropy": 8.900346755981445, "epoch": 0.4201107375914574, "mean_token_accuracy": 0.7315068244934082, "num_tokens": 23355641.0, "step": 4249, "train/ce_loss": 1.2955416440963745 }, { "epoch": 0.4201107375914574, "step": 4249, "train/sim_loss": 0.0625 }, { "epoch": 0.4201107375914574, "step": 4249, "train/total_loss": 0.1920541673898697 }, { "entropy": 8.670159339904785, "epoch": 0.42020961044097294, "mean_token_accuracy": 0.6650987863540649, "num_tokens": 23361322.0, "step": 4250, "train/ce_loss": 0.9729334712028503 }, { "epoch": 0.42020961044097294, "step": 4250, "train/sim_loss": 0.0625 }, { "epoch": 0.42020961044097294, "step": 4250, "train/total_loss": 0.15979334712028503 }, { "entropy": 8.836281776428223, "epoch": 0.4203084832904884, "mean_token_accuracy": 0.8108108043670654, "num_tokens": 23366784.0, "step": 4251, "train/ce_loss": 0.5651025772094727 }, { "epoch": 0.4203084832904884, "step": 4251, "train/sim_loss": 0.015625 }, { "epoch": 0.4203084832904884, "step": 4251, "train/total_loss": 0.07213525474071503 }, { "entropy": 8.955548286437988, "epoch": 0.42040735614000396, "mean_token_accuracy": 0.7476635575294495, "num_tokens": 23372205.0, "step": 4252, "train/ce_loss": 0.49266964197158813 }, { "epoch": 0.42040735614000396, "step": 4252, "train/sim_loss": 0.1015625 }, { "epoch": 0.42040735614000396, "step": 4252, "train/total_loss": 0.1508294641971588 }, { "entropy": 9.025362014770508, "epoch": 0.4205062289895195, "mean_token_accuracy": 0.7383592128753662, "num_tokens": 23377771.0, "step": 4253, "train/ce_loss": 0.9049065709114075 }, { "epoch": 0.4205062289895195, "step": 4253, "train/sim_loss": 0.046875 }, { "epoch": 0.4205062289895195, "step": 4253, "train/total_loss": 0.1373656690120697 }, { "entropy": 8.641603469848633, "epoch": 0.420605101839035, "mean_token_accuracy": 0.7947643995285034, "num_tokens": 23383385.0, "step": 4254, "train/ce_loss": 0.772255003452301 }, { "epoch": 0.420605101839035, "step": 4254, "train/sim_loss": 0.02734375 }, { "epoch": 0.420605101839035, "step": 4254, "train/total_loss": 0.10456924885511398 }, { "entropy": 8.999517440795898, "epoch": 0.42070397468855053, "mean_token_accuracy": 0.7612826824188232, "num_tokens": 23388833.0, "step": 4255, "train/ce_loss": 0.6391456723213196 }, { "epoch": 0.42070397468855053, "step": 4255, "train/sim_loss": 0.0390625 }, { "epoch": 0.42070397468855053, "step": 4255, "train/total_loss": 0.10297706723213196 }, { "entropy": 8.84532356262207, "epoch": 0.42080284753806607, "mean_token_accuracy": 0.7303754091262817, "num_tokens": 23394280.0, "step": 4256, "train/ce_loss": 0.738776445388794 }, { "epoch": 0.42080284753806607, "step": 4256, "train/sim_loss": 0.0546875 }, { "epoch": 0.42080284753806607, "step": 4256, "train/total_loss": 0.12856514751911163 }, { "entropy": 9.206640243530273, "epoch": 0.42090172038758156, "mean_token_accuracy": 0.7590206265449524, "num_tokens": 23399681.0, "step": 4257, "train/ce_loss": 0.5536872744560242 }, { "epoch": 0.42090172038758156, "step": 4257, "train/sim_loss": 0.0234375 }, { "epoch": 0.42090172038758156, "step": 4257, "train/total_loss": 0.07880622893571854 }, { "entropy": 9.033958435058594, "epoch": 0.4210005932370971, "mean_token_accuracy": 0.783446729183197, "num_tokens": 23405215.0, "step": 4258, "train/ce_loss": 0.372598797082901 }, { "epoch": 0.4210005932370971, "step": 4258, "train/sim_loss": 0.125 }, { "epoch": 0.4210005932370971, "step": 4258, "train/total_loss": 0.16225987672805786 }, { "entropy": 9.130728721618652, "epoch": 0.42109946608661264, "mean_token_accuracy": 0.7798277735710144, "num_tokens": 23410649.0, "step": 4259, "train/ce_loss": 0.7234640717506409 }, { "epoch": 0.42109946608661264, "step": 4259, "train/sim_loss": 0.05859375 }, { "epoch": 0.42109946608661264, "step": 4259, "train/total_loss": 0.13094016909599304 }, { "epoch": 0.4211983389361281, "grad_norm": 0.6883321404457092, "learning_rate": 8.949463482173763e-06, "loss": 0.1384, "step": 4260 }, { "entropy": 8.95981216430664, "epoch": 0.4211983389361281, "mean_token_accuracy": 0.8031212687492371, "num_tokens": 23416085.0, "step": 4260, "train/ce_loss": 0.3725045919418335 }, { "epoch": 0.4211983389361281, "step": 4260, "train/sim_loss": 0.078125 }, { "epoch": 0.4211983389361281, "step": 4260, "train/total_loss": 0.11537545919418335 }, { "entropy": 8.692180633544922, "epoch": 0.42129721178564367, "mean_token_accuracy": 0.7737321257591248, "num_tokens": 23421400.0, "step": 4261, "train/ce_loss": 0.9329214096069336 }, { "epoch": 0.42129721178564367, "step": 4261, "train/sim_loss": 0.11328125 }, { "epoch": 0.42129721178564367, "step": 4261, "train/total_loss": 0.20657339692115784 }, { "entropy": 9.15539836883545, "epoch": 0.4213960846351592, "mean_token_accuracy": 0.7135207653045654, "num_tokens": 23426784.0, "step": 4262, "train/ce_loss": 0.6744425892829895 }, { "epoch": 0.4213960846351592, "step": 4262, "train/sim_loss": 0.0546875 }, { "epoch": 0.4213960846351592, "step": 4262, "train/total_loss": 0.12213175743818283 }, { "entropy": 9.02420425415039, "epoch": 0.4214949574846747, "mean_token_accuracy": 0.7428910136222839, "num_tokens": 23432221.0, "step": 4263, "train/ce_loss": 0.8161007761955261 }, { "epoch": 0.4214949574846747, "step": 4263, "train/sim_loss": 0.0546875 }, { "epoch": 0.4214949574846747, "step": 4263, "train/total_loss": 0.1362975835800171 }, { "entropy": 9.055652618408203, "epoch": 0.42159383033419023, "mean_token_accuracy": 0.7721088528633118, "num_tokens": 23437641.0, "step": 4264, "train/ce_loss": 0.48177093267440796 }, { "epoch": 0.42159383033419023, "step": 4264, "train/sim_loss": 0.0234375 }, { "epoch": 0.42159383033419023, "step": 4264, "train/total_loss": 0.0716145932674408 }, { "entropy": 8.918096542358398, "epoch": 0.4216927031837058, "mean_token_accuracy": 0.7424242496490479, "num_tokens": 23443100.0, "step": 4265, "train/ce_loss": 0.5448325872421265 }, { "epoch": 0.4216927031837058, "step": 4265, "train/sim_loss": 0.03515625 }, { "epoch": 0.4216927031837058, "step": 4265, "train/total_loss": 0.08963951468467712 }, { "entropy": 8.963521003723145, "epoch": 0.42179157603322126, "mean_token_accuracy": 0.7779005765914917, "num_tokens": 23448592.0, "step": 4266, "train/ce_loss": 0.43649667501449585 }, { "epoch": 0.42179157603322126, "step": 4266, "train/sim_loss": 0.0234375 }, { "epoch": 0.42179157603322126, "step": 4266, "train/total_loss": 0.06708717346191406 }, { "entropy": 9.049753189086914, "epoch": 0.4218904488827368, "mean_token_accuracy": 0.7094117403030396, "num_tokens": 23454027.0, "step": 4267, "train/ce_loss": 0.43366262316703796 }, { "epoch": 0.4218904488827368, "step": 4267, "train/sim_loss": 0.015625 }, { "epoch": 0.4218904488827368, "step": 4267, "train/total_loss": 0.058991264551877975 }, { "entropy": 9.180304527282715, "epoch": 0.42198932173225234, "mean_token_accuracy": 0.7243589758872986, "num_tokens": 23459420.0, "step": 4268, "train/ce_loss": 0.7852020859718323 }, { "epoch": 0.42198932173225234, "step": 4268, "train/sim_loss": 0.078125 }, { "epoch": 0.42198932173225234, "step": 4268, "train/total_loss": 0.15664520859718323 }, { "entropy": 8.912206649780273, "epoch": 0.42208819458176783, "mean_token_accuracy": 0.7379553318023682, "num_tokens": 23464876.0, "step": 4269, "train/ce_loss": 0.8618438243865967 }, { "epoch": 0.42208819458176783, "step": 4269, "train/sim_loss": 0.01953125 }, { "epoch": 0.42208819458176783, "step": 4269, "train/total_loss": 0.10571563243865967 }, { "entropy": 8.543493270874023, "epoch": 0.42218706743128337, "mean_token_accuracy": 0.743755042552948, "num_tokens": 23470661.0, "step": 4270, "train/ce_loss": 0.2796454429626465 }, { "epoch": 0.42218706743128337, "step": 4270, "train/sim_loss": 0.046875 }, { "epoch": 0.42218706743128337, "step": 4270, "train/total_loss": 0.07483954727649689 }, { "entropy": 9.15056037902832, "epoch": 0.4222859402807989, "mean_token_accuracy": 0.7130647301673889, "num_tokens": 23476161.0, "step": 4271, "train/ce_loss": 0.5821291208267212 }, { "epoch": 0.4222859402807989, "step": 4271, "train/sim_loss": 0.046875 }, { "epoch": 0.4222859402807989, "step": 4271, "train/total_loss": 0.10508791357278824 }, { "entropy": 8.715227127075195, "epoch": 0.4223848131303144, "mean_token_accuracy": 0.6961206793785095, "num_tokens": 23481757.0, "step": 4272, "train/ce_loss": 1.4134061336517334 }, { "epoch": 0.4223848131303144, "step": 4272, "train/sim_loss": 0.11328125 }, { "epoch": 0.4223848131303144, "step": 4272, "train/total_loss": 0.25462186336517334 }, { "entropy": 9.279730796813965, "epoch": 0.42248368597982994, "mean_token_accuracy": 0.7371202111244202, "num_tokens": 23486986.0, "step": 4273, "train/ce_loss": 0.9436401128768921 }, { "epoch": 0.42248368597982994, "step": 4273, "train/sim_loss": 0.04296875 }, { "epoch": 0.42248368597982994, "step": 4273, "train/total_loss": 0.1373327672481537 }, { "entropy": 8.82248306274414, "epoch": 0.4225825588293455, "mean_token_accuracy": 0.7494577169418335, "num_tokens": 23492473.0, "step": 4274, "train/ce_loss": 0.5347586870193481 }, { "epoch": 0.4225825588293455, "step": 4274, "train/sim_loss": 0.11328125 }, { "epoch": 0.4225825588293455, "step": 4274, "train/total_loss": 0.16675712168216705 }, { "entropy": 8.766021728515625, "epoch": 0.42268143167886096, "mean_token_accuracy": 0.7116504907608032, "num_tokens": 23498125.0, "step": 4275, "train/ce_loss": 0.8976816534996033 }, { "epoch": 0.42268143167886096, "step": 4275, "train/sim_loss": 0.078125 }, { "epoch": 0.42268143167886096, "step": 4275, "train/total_loss": 0.1678931713104248 }, { "entropy": 9.019716262817383, "epoch": 0.4227803045283765, "mean_token_accuracy": 0.6922155618667603, "num_tokens": 23503616.0, "step": 4276, "train/ce_loss": 0.5765544772148132 }, { "epoch": 0.4227803045283765, "step": 4276, "train/sim_loss": 0.0703125 }, { "epoch": 0.4227803045283765, "step": 4276, "train/total_loss": 0.1279679536819458 }, { "entropy": 8.957992553710938, "epoch": 0.42287917737789205, "mean_token_accuracy": 0.7089783549308777, "num_tokens": 23509215.0, "step": 4277, "train/ce_loss": 0.5640846490859985 }, { "epoch": 0.42287917737789205, "step": 4277, "train/sim_loss": 0.08984375 }, { "epoch": 0.42287917737789205, "step": 4277, "train/total_loss": 0.14625221490859985 }, { "entropy": 8.707660675048828, "epoch": 0.42297805022740753, "mean_token_accuracy": 0.7783074975013733, "num_tokens": 23514710.0, "step": 4278, "train/ce_loss": 0.713432252407074 }, { "epoch": 0.42297805022740753, "step": 4278, "train/sim_loss": 0.0234375 }, { "epoch": 0.42297805022740753, "step": 4278, "train/total_loss": 0.09478072822093964 }, { "entropy": 8.69637680053711, "epoch": 0.4230769230769231, "mean_token_accuracy": 0.7873620986938477, "num_tokens": 23520319.0, "step": 4279, "train/ce_loss": 0.8120812773704529 }, { "epoch": 0.4230769230769231, "step": 4279, "train/sim_loss": 0.03125 }, { "epoch": 0.4230769230769231, "step": 4279, "train/total_loss": 0.11245813220739365 }, { "epoch": 0.4231757959264386, "grad_norm": 0.680225670337677, "learning_rate": 8.944518617415815e-06, "loss": 0.1435, "step": 4280 }, { "entropy": 8.675450325012207, "epoch": 0.4231757959264386, "mean_token_accuracy": 0.7681307196617126, "num_tokens": 23526081.0, "step": 4280, "train/ce_loss": 0.7309013605117798 }, { "epoch": 0.4231757959264386, "step": 4280, "train/sim_loss": 0.1015625 }, { "epoch": 0.4231757959264386, "step": 4280, "train/total_loss": 0.17465263605117798 }, { "entropy": 8.54493522644043, "epoch": 0.4232746687759541, "mean_token_accuracy": 0.704692542552948, "num_tokens": 23531862.0, "step": 4281, "train/ce_loss": 2.1578664779663086 }, { "epoch": 0.4232746687759541, "step": 4281, "train/sim_loss": 0.06640625 }, { "epoch": 0.4232746687759541, "step": 4281, "train/total_loss": 0.2821928858757019 }, { "entropy": 8.956356048583984, "epoch": 0.42337354162546964, "mean_token_accuracy": 0.7368420958518982, "num_tokens": 23537380.0, "step": 4282, "train/ce_loss": 0.7334954738616943 }, { "epoch": 0.42337354162546964, "step": 4282, "train/sim_loss": 0.046875 }, { "epoch": 0.42337354162546964, "step": 4282, "train/total_loss": 0.12022455036640167 }, { "entropy": 8.865817070007324, "epoch": 0.4234724144749852, "mean_token_accuracy": 0.8062360882759094, "num_tokens": 23542850.0, "step": 4283, "train/ce_loss": 0.3446737825870514 }, { "epoch": 0.4234724144749852, "step": 4283, "train/sim_loss": 0.03515625 }, { "epoch": 0.4234724144749852, "step": 4283, "train/total_loss": 0.06962363421916962 }, { "entropy": 9.08142375946045, "epoch": 0.42357128732450067, "mean_token_accuracy": 0.7563451528549194, "num_tokens": 23548257.0, "step": 4284, "train/ce_loss": 0.7085339426994324 }, { "epoch": 0.42357128732450067, "step": 4284, "train/sim_loss": 0.08984375 }, { "epoch": 0.42357128732450067, "step": 4284, "train/total_loss": 0.16069714725017548 }, { "entropy": 8.929805755615234, "epoch": 0.4236701601740162, "mean_token_accuracy": 0.7020089030265808, "num_tokens": 23553675.0, "step": 4285, "train/ce_loss": 1.5199469327926636 }, { "epoch": 0.4236701601740162, "step": 4285, "train/sim_loss": 0.0625 }, { "epoch": 0.4236701601740162, "step": 4285, "train/total_loss": 0.21449469029903412 }, { "entropy": 8.81942081451416, "epoch": 0.42376903302353175, "mean_token_accuracy": 0.7435294389724731, "num_tokens": 23559160.0, "step": 4286, "train/ce_loss": 1.009212613105774 }, { "epoch": 0.42376903302353175, "step": 4286, "train/sim_loss": 0.07421875 }, { "epoch": 0.42376903302353175, "step": 4286, "train/total_loss": 0.17514002323150635 }, { "entropy": 8.563249588012695, "epoch": 0.42386790587304723, "mean_token_accuracy": 0.6519823670387268, "num_tokens": 23564850.0, "step": 4287, "train/ce_loss": 0.8049797415733337 }, { "epoch": 0.42386790587304723, "step": 4287, "train/sim_loss": 0.1015625 }, { "epoch": 0.42386790587304723, "step": 4287, "train/total_loss": 0.18206048011779785 }, { "entropy": 8.717697143554688, "epoch": 0.4239667787225628, "mean_token_accuracy": 0.7540515065193176, "num_tokens": 23570567.0, "step": 4288, "train/ce_loss": 0.4020925760269165 }, { "epoch": 0.4239667787225628, "step": 4288, "train/sim_loss": 0.0234375 }, { "epoch": 0.4239667787225628, "step": 4288, "train/total_loss": 0.06364676356315613 }, { "entropy": 8.943885803222656, "epoch": 0.4240656515720783, "mean_token_accuracy": 0.7508730888366699, "num_tokens": 23576005.0, "step": 4289, "train/ce_loss": 0.7887842059135437 }, { "epoch": 0.4240656515720783, "step": 4289, "train/sim_loss": 0.07421875 }, { "epoch": 0.4240656515720783, "step": 4289, "train/total_loss": 0.15309718251228333 }, { "entropy": 8.958918571472168, "epoch": 0.4241645244215938, "mean_token_accuracy": 0.6983931064605713, "num_tokens": 23581417.0, "step": 4290, "train/ce_loss": 1.0287266969680786 }, { "epoch": 0.4241645244215938, "step": 4290, "train/sim_loss": 0.09375 }, { "epoch": 0.4241645244215938, "step": 4290, "train/total_loss": 0.19662266969680786 }, { "entropy": 8.491270065307617, "epoch": 0.42426339727110934, "mean_token_accuracy": 0.6754863858222961, "num_tokens": 23587226.0, "step": 4291, "train/ce_loss": 1.5087435245513916 }, { "epoch": 0.42426339727110934, "step": 4291, "train/sim_loss": 0.0625 }, { "epoch": 0.42426339727110934, "step": 4291, "train/total_loss": 0.21337436139583588 }, { "entropy": 8.9766206741333, "epoch": 0.4243622701206249, "mean_token_accuracy": 0.776150643825531, "num_tokens": 23592826.0, "step": 4292, "train/ce_loss": 0.38587960600852966 }, { "epoch": 0.4243622701206249, "step": 4292, "train/sim_loss": 0.01953125 }, { "epoch": 0.4243622701206249, "step": 4292, "train/total_loss": 0.058119211345911026 }, { "entropy": 8.957941055297852, "epoch": 0.4244611429701404, "mean_token_accuracy": 0.7895377278327942, "num_tokens": 23598266.0, "step": 4293, "train/ce_loss": 0.36605384945869446 }, { "epoch": 0.4244611429701404, "step": 4293, "train/sim_loss": 0.015625 }, { "epoch": 0.4244611429701404, "step": 4293, "train/total_loss": 0.052230384200811386 }, { "entropy": 8.68549919128418, "epoch": 0.4245600158196559, "mean_token_accuracy": 0.7795698642730713, "num_tokens": 23603862.0, "step": 4294, "train/ce_loss": 0.6743130087852478 }, { "epoch": 0.4245600158196559, "step": 4294, "train/sim_loss": 0.078125 }, { "epoch": 0.4245600158196559, "step": 4294, "train/total_loss": 0.14555630087852478 }, { "entropy": 9.118549346923828, "epoch": 0.42465888866917145, "mean_token_accuracy": 0.7353689670562744, "num_tokens": 23609198.0, "step": 4295, "train/ce_loss": 1.0420154333114624 }, { "epoch": 0.42465888866917145, "step": 4295, "train/sim_loss": 0.046875 }, { "epoch": 0.42465888866917145, "step": 4295, "train/total_loss": 0.1510765552520752 }, { "entropy": 9.111371040344238, "epoch": 0.424757761518687, "mean_token_accuracy": 0.7362250685691833, "num_tokens": 23614654.0, "step": 4296, "train/ce_loss": 0.9063346982002258 }, { "epoch": 0.424757761518687, "step": 4296, "train/sim_loss": 0.05078125 }, { "epoch": 0.424757761518687, "step": 4296, "train/total_loss": 0.14141473174095154 }, { "entropy": 8.720935821533203, "epoch": 0.4248566343682025, "mean_token_accuracy": 0.7440147399902344, "num_tokens": 23620312.0, "step": 4297, "train/ce_loss": 1.5217219591140747 }, { "epoch": 0.4248566343682025, "step": 4297, "train/sim_loss": 0.05859375 }, { "epoch": 0.4248566343682025, "step": 4297, "train/total_loss": 0.21076594293117523 }, { "entropy": 8.59457015991211, "epoch": 0.424955507217718, "mean_token_accuracy": 0.738269031047821, "num_tokens": 23625945.0, "step": 4298, "train/ce_loss": 1.0574226379394531 }, { "epoch": 0.424955507217718, "step": 4298, "train/sim_loss": 0.0546875 }, { "epoch": 0.424955507217718, "step": 4298, "train/total_loss": 0.16042977571487427 }, { "entropy": 9.316519737243652, "epoch": 0.42505438006723356, "mean_token_accuracy": 0.7071078419685364, "num_tokens": 23631408.0, "step": 4299, "train/ce_loss": 0.8681966662406921 }, { "epoch": 0.42505438006723356, "step": 4299, "train/sim_loss": 0.1171875 }, { "epoch": 0.42505438006723356, "step": 4299, "train/total_loss": 0.20400717854499817 }, { "epoch": 0.42515325291674905, "grad_norm": 0.9127551317214966, "learning_rate": 8.939573752657866e-06, "loss": 0.1479, "step": 4300 }, { "entropy": 9.33104133605957, "epoch": 0.42515325291674905, "mean_token_accuracy": 0.7551020383834839, "num_tokens": 23636684.0, "step": 4300, "train/ce_loss": 0.818688690662384 }, { "epoch": 0.42515325291674905, "step": 4300, "train/sim_loss": 0.04296875 }, { "epoch": 0.42515325291674905, "step": 4300, "train/total_loss": 0.12483762204647064 }, { "entropy": 9.270234107971191, "epoch": 0.4252521257662646, "mean_token_accuracy": 0.7390146255493164, "num_tokens": 23642005.0, "step": 4301, "train/ce_loss": 0.5039335489273071 }, { "epoch": 0.4252521257662646, "step": 4301, "train/sim_loss": 0.06640625 }, { "epoch": 0.4252521257662646, "step": 4301, "train/total_loss": 0.11679960787296295 }, { "entropy": 8.752325057983398, "epoch": 0.42535099861578013, "mean_token_accuracy": 0.8226163983345032, "num_tokens": 23647591.0, "step": 4302, "train/ce_loss": 0.4678003489971161 }, { "epoch": 0.42535099861578013, "step": 4302, "train/sim_loss": 0.02734375 }, { "epoch": 0.42535099861578013, "step": 4302, "train/total_loss": 0.07412378489971161 }, { "entropy": 8.9248685836792, "epoch": 0.4254498714652956, "mean_token_accuracy": 0.6916395425796509, "num_tokens": 23653121.0, "step": 4303, "train/ce_loss": 1.3367574214935303 }, { "epoch": 0.4254498714652956, "step": 4303, "train/sim_loss": 0.07421875 }, { "epoch": 0.4254498714652956, "step": 4303, "train/total_loss": 0.2078944891691208 }, { "entropy": 8.956079483032227, "epoch": 0.42554874431481116, "mean_token_accuracy": 0.7392739057540894, "num_tokens": 23658682.0, "step": 4304, "train/ce_loss": 0.628112256526947 }, { "epoch": 0.42554874431481116, "step": 4304, "train/sim_loss": 0.0546875 }, { "epoch": 0.42554874431481116, "step": 4304, "train/total_loss": 0.1174987256526947 }, { "entropy": 9.062653541564941, "epoch": 0.4256476171643267, "mean_token_accuracy": 0.7862069010734558, "num_tokens": 23664137.0, "step": 4305, "train/ce_loss": 0.7313695549964905 }, { "epoch": 0.4256476171643267, "step": 4305, "train/sim_loss": 0.0625 }, { "epoch": 0.4256476171643267, "step": 4305, "train/total_loss": 0.13563695549964905 }, { "entropy": 8.88763427734375, "epoch": 0.4257464900138422, "mean_token_accuracy": 0.7406483888626099, "num_tokens": 23669519.0, "step": 4306, "train/ce_loss": 0.7918031215667725 }, { "epoch": 0.4257464900138422, "step": 4306, "train/sim_loss": 0.0234375 }, { "epoch": 0.4257464900138422, "step": 4306, "train/total_loss": 0.10261781513690948 }, { "entropy": 9.183318138122559, "epoch": 0.4258453628633577, "mean_token_accuracy": 0.7728459239006042, "num_tokens": 23674896.0, "step": 4307, "train/ce_loss": 0.5131075978279114 }, { "epoch": 0.4258453628633577, "step": 4307, "train/sim_loss": 0.05078125 }, { "epoch": 0.4258453628633577, "step": 4307, "train/total_loss": 0.10209201276302338 }, { "entropy": 9.149539947509766, "epoch": 0.42594423571287326, "mean_token_accuracy": 0.7451253533363342, "num_tokens": 23680164.0, "step": 4308, "train/ce_loss": 0.6851814985275269 }, { "epoch": 0.42594423571287326, "step": 4308, "train/sim_loss": 0.0703125 }, { "epoch": 0.42594423571287326, "step": 4308, "train/total_loss": 0.13883066177368164 }, { "entropy": 8.82754135131836, "epoch": 0.42604310856238875, "mean_token_accuracy": 0.6998050808906555, "num_tokens": 23685782.0, "step": 4309, "train/ce_loss": 0.916694164276123 }, { "epoch": 0.42604310856238875, "step": 4309, "train/sim_loss": 0.06640625 }, { "epoch": 0.42604310856238875, "step": 4309, "train/total_loss": 0.15807566046714783 }, { "entropy": 8.985616683959961, "epoch": 0.4261419814119043, "mean_token_accuracy": 0.8090507984161377, "num_tokens": 23691295.0, "step": 4310, "train/ce_loss": 0.5450083017349243 }, { "epoch": 0.4261419814119043, "step": 4310, "train/sim_loss": 0.046875 }, { "epoch": 0.4261419814119043, "step": 4310, "train/total_loss": 0.10137583315372467 }, { "entropy": 9.172715187072754, "epoch": 0.42624085426141983, "mean_token_accuracy": 0.7200000286102295, "num_tokens": 23696554.0, "step": 4311, "train/ce_loss": 1.2602288722991943 }, { "epoch": 0.42624085426141983, "step": 4311, "train/sim_loss": 0.046875 }, { "epoch": 0.42624085426141983, "step": 4311, "train/total_loss": 0.17289789021015167 }, { "entropy": 8.835046768188477, "epoch": 0.4263397271109353, "mean_token_accuracy": 0.73893803358078, "num_tokens": 23702056.0, "step": 4312, "train/ce_loss": 1.245605707168579 }, { "epoch": 0.4263397271109353, "step": 4312, "train/sim_loss": 0.0625 }, { "epoch": 0.4263397271109353, "step": 4312, "train/total_loss": 0.18706056475639343 }, { "entropy": 8.76102066040039, "epoch": 0.42643859996045086, "mean_token_accuracy": 0.7563959956169128, "num_tokens": 23707616.0, "step": 4313, "train/ce_loss": 0.6886029243469238 }, { "epoch": 0.42643859996045086, "step": 4313, "train/sim_loss": 0.06640625 }, { "epoch": 0.42643859996045086, "step": 4313, "train/total_loss": 0.13526654243469238 }, { "entropy": 9.162178039550781, "epoch": 0.4265374728099664, "mean_token_accuracy": 0.7969821691513062, "num_tokens": 23713005.0, "step": 4314, "train/ce_loss": 0.9298944473266602 }, { "epoch": 0.4265374728099664, "step": 4314, "train/sim_loss": 0.04296875 }, { "epoch": 0.4265374728099664, "step": 4314, "train/total_loss": 0.13595819473266602 }, { "entropy": 8.916435241699219, "epoch": 0.4266363456594819, "mean_token_accuracy": 0.7487499713897705, "num_tokens": 23718347.0, "step": 4315, "train/ce_loss": 0.7563055157661438 }, { "epoch": 0.4266363456594819, "step": 4315, "train/sim_loss": 0.0390625 }, { "epoch": 0.4266363456594819, "step": 4315, "train/total_loss": 0.1146930530667305 }, { "entropy": 8.955451965332031, "epoch": 0.4267352185089974, "mean_token_accuracy": 0.7335811853408813, "num_tokens": 23723750.0, "step": 4316, "train/ce_loss": 0.813073456287384 }, { "epoch": 0.4267352185089974, "step": 4316, "train/sim_loss": 0.05078125 }, { "epoch": 0.4267352185089974, "step": 4316, "train/total_loss": 0.13208860158920288 }, { "entropy": 8.89845085144043, "epoch": 0.42683409135851297, "mean_token_accuracy": 0.7385475039482117, "num_tokens": 23729447.0, "step": 4317, "train/ce_loss": 0.4689614772796631 }, { "epoch": 0.42683409135851297, "step": 4317, "train/sim_loss": 0.10546875 }, { "epoch": 0.42683409135851297, "step": 4317, "train/total_loss": 0.15236489474773407 }, { "entropy": 8.978468894958496, "epoch": 0.42693296420802845, "mean_token_accuracy": 0.6967821717262268, "num_tokens": 23734867.0, "step": 4318, "train/ce_loss": 1.3115153312683105 }, { "epoch": 0.42693296420802845, "step": 4318, "train/sim_loss": 0.07421875 }, { "epoch": 0.42693296420802845, "step": 4318, "train/total_loss": 0.20537029206752777 }, { "entropy": 9.007421493530273, "epoch": 0.427031837057544, "mean_token_accuracy": 0.7030237317085266, "num_tokens": 23740463.0, "step": 4319, "train/ce_loss": 0.5465189218521118 }, { "epoch": 0.427031837057544, "step": 4319, "train/sim_loss": 0.0703125 }, { "epoch": 0.427031837057544, "step": 4319, "train/total_loss": 0.1249643936753273 }, { "epoch": 0.42713070990705954, "grad_norm": 0.7409549355506897, "learning_rate": 8.934628887899916e-06, "loss": 0.136, "step": 4320 }, { "entropy": 9.181217193603516, "epoch": 0.42713070990705954, "mean_token_accuracy": 0.7249666452407837, "num_tokens": 23745771.0, "step": 4320, "train/ce_loss": 1.0756255388259888 }, { "epoch": 0.42713070990705954, "step": 4320, "train/sim_loss": 0.0625 }, { "epoch": 0.42713070990705954, "step": 4320, "train/total_loss": 0.17006255686283112 }, { "entropy": 8.74440860748291, "epoch": 0.427229582756575, "mean_token_accuracy": 0.7173333168029785, "num_tokens": 23751174.0, "step": 4321, "train/ce_loss": 0.8402342200279236 }, { "epoch": 0.427229582756575, "step": 4321, "train/sim_loss": 0.046875 }, { "epoch": 0.427229582756575, "step": 4321, "train/total_loss": 0.13089841604232788 }, { "entropy": 8.93539810180664, "epoch": 0.42732845560609056, "mean_token_accuracy": 0.7932885885238647, "num_tokens": 23756562.0, "step": 4322, "train/ce_loss": 0.5285578370094299 }, { "epoch": 0.42732845560609056, "step": 4322, "train/sim_loss": 0.046875 }, { "epoch": 0.42732845560609056, "step": 4322, "train/total_loss": 0.09973078966140747 }, { "entropy": 8.955608367919922, "epoch": 0.4274273284556061, "mean_token_accuracy": 0.7233532667160034, "num_tokens": 23762099.0, "step": 4323, "train/ce_loss": 0.9355547428131104 }, { "epoch": 0.4274273284556061, "step": 4323, "train/sim_loss": 0.0546875 }, { "epoch": 0.4274273284556061, "step": 4323, "train/total_loss": 0.1482429802417755 }, { "entropy": 8.96577262878418, "epoch": 0.4275262013051216, "mean_token_accuracy": 0.7666302919387817, "num_tokens": 23767656.0, "step": 4324, "train/ce_loss": 0.8575612306594849 }, { "epoch": 0.4275262013051216, "step": 4324, "train/sim_loss": 0.1015625 }, { "epoch": 0.4275262013051216, "step": 4324, "train/total_loss": 0.1873186230659485 }, { "entropy": 8.902002334594727, "epoch": 0.42762507415463713, "mean_token_accuracy": 0.7259414196014404, "num_tokens": 23773187.0, "step": 4325, "train/ce_loss": 1.0520542860031128 }, { "epoch": 0.42762507415463713, "step": 4325, "train/sim_loss": 0.046875 }, { "epoch": 0.42762507415463713, "step": 4325, "train/total_loss": 0.15208043158054352 }, { "entropy": 8.598302841186523, "epoch": 0.42772394700415267, "mean_token_accuracy": 0.8122448921203613, "num_tokens": 23778849.0, "step": 4326, "train/ce_loss": 0.3427233099937439 }, { "epoch": 0.42772394700415267, "step": 4326, "train/sim_loss": 0.05859375 }, { "epoch": 0.42772394700415267, "step": 4326, "train/total_loss": 0.09286607801914215 }, { "entropy": 8.935439109802246, "epoch": 0.42782281985366816, "mean_token_accuracy": 0.7420118451118469, "num_tokens": 23784337.0, "step": 4327, "train/ce_loss": 0.7301582098007202 }, { "epoch": 0.42782281985366816, "step": 4327, "train/sim_loss": 0.0390625 }, { "epoch": 0.42782281985366816, "step": 4327, "train/total_loss": 0.11207832396030426 }, { "entropy": 8.616314888000488, "epoch": 0.4279216927031837, "mean_token_accuracy": 0.7640878558158875, "num_tokens": 23789993.0, "step": 4328, "train/ce_loss": 0.5628671646118164 }, { "epoch": 0.4279216927031837, "step": 4328, "train/sim_loss": 0.01953125 }, { "epoch": 0.4279216927031837, "step": 4328, "train/total_loss": 0.07581797242164612 }, { "entropy": 8.953514099121094, "epoch": 0.42802056555269924, "mean_token_accuracy": 0.7201783657073975, "num_tokens": 23795493.0, "step": 4329, "train/ce_loss": 0.6452338695526123 }, { "epoch": 0.42802056555269924, "step": 4329, "train/sim_loss": 0.05859375 }, { "epoch": 0.42802056555269924, "step": 4329, "train/total_loss": 0.12311714142560959 }, { "entropy": 9.081363677978516, "epoch": 0.4281194384022147, "mean_token_accuracy": 0.7553793787956238, "num_tokens": 23800914.0, "step": 4330, "train/ce_loss": 0.6653828620910645 }, { "epoch": 0.4281194384022147, "step": 4330, "train/sim_loss": 0.02734375 }, { "epoch": 0.4281194384022147, "step": 4330, "train/total_loss": 0.09388203918933868 }, { "entropy": 8.863423347473145, "epoch": 0.42821831125173027, "mean_token_accuracy": 0.7064732313156128, "num_tokens": 23806481.0, "step": 4331, "train/ce_loss": 1.822793960571289 }, { "epoch": 0.42821831125173027, "step": 4331, "train/sim_loss": 0.078125 }, { "epoch": 0.42821831125173027, "step": 4331, "train/total_loss": 0.26040440797805786 }, { "entropy": 8.97493839263916, "epoch": 0.4283171841012458, "mean_token_accuracy": 0.7490445971488953, "num_tokens": 23811918.0, "step": 4332, "train/ce_loss": 0.8429521322250366 }, { "epoch": 0.4283171841012458, "step": 4332, "train/sim_loss": 0.078125 }, { "epoch": 0.4283171841012458, "step": 4332, "train/total_loss": 0.16242021322250366 }, { "entropy": 8.924827575683594, "epoch": 0.42841605695076135, "mean_token_accuracy": 0.7364621162414551, "num_tokens": 23817204.0, "step": 4333, "train/ce_loss": 0.623823344707489 }, { "epoch": 0.42841605695076135, "step": 4333, "train/sim_loss": 0.078125 }, { "epoch": 0.42841605695076135, "step": 4333, "train/total_loss": 0.14050734043121338 }, { "entropy": 9.158557891845703, "epoch": 0.42851492980027683, "mean_token_accuracy": 0.7229219079017639, "num_tokens": 23822631.0, "step": 4334, "train/ce_loss": 1.2451270818710327 }, { "epoch": 0.42851492980027683, "step": 4334, "train/sim_loss": 0.05078125 }, { "epoch": 0.42851492980027683, "step": 4334, "train/total_loss": 0.1752939522266388 }, { "entropy": 8.827362060546875, "epoch": 0.4286138026497924, "mean_token_accuracy": 0.7436463832855225, "num_tokens": 23828154.0, "step": 4335, "train/ce_loss": 0.49334636330604553 }, { "epoch": 0.4286138026497924, "step": 4335, "train/sim_loss": 0.01953125 }, { "epoch": 0.4286138026497924, "step": 4335, "train/total_loss": 0.06886588782072067 }, { "entropy": 8.894794464111328, "epoch": 0.4287126754993079, "mean_token_accuracy": 0.7995618581771851, "num_tokens": 23833643.0, "step": 4336, "train/ce_loss": 0.7573347687721252 }, { "epoch": 0.4287126754993079, "step": 4336, "train/sim_loss": 0.11328125 }, { "epoch": 0.4287126754993079, "step": 4336, "train/total_loss": 0.189014732837677 }, { "entropy": 8.563762664794922, "epoch": 0.4288115483488234, "mean_token_accuracy": 0.6660268902778625, "num_tokens": 23839239.0, "step": 4337, "train/ce_loss": 1.4215240478515625 }, { "epoch": 0.4288115483488234, "step": 4337, "train/sim_loss": 0.08203125 }, { "epoch": 0.4288115483488234, "step": 4337, "train/total_loss": 0.22418366372585297 }, { "entropy": 8.878945350646973, "epoch": 0.42891042119833894, "mean_token_accuracy": 0.7744680643081665, "num_tokens": 23844973.0, "step": 4338, "train/ce_loss": 0.4488917887210846 }, { "epoch": 0.42891042119833894, "step": 4338, "train/sim_loss": 0.09375 }, { "epoch": 0.42891042119833894, "step": 4338, "train/total_loss": 0.1386391818523407 }, { "entropy": 9.113162994384766, "epoch": 0.4290092940478545, "mean_token_accuracy": 0.6946022510528564, "num_tokens": 23850308.0, "step": 4339, "train/ce_loss": 0.6030559539794922 }, { "epoch": 0.4290092940478545, "step": 4339, "train/sim_loss": 0.0703125 }, { "epoch": 0.4290092940478545, "step": 4339, "train/total_loss": 0.13061809539794922 }, { "epoch": 0.42910816689736997, "grad_norm": 0.9209092259407043, "learning_rate": 8.929684023141968e-06, "loss": 0.1486, "step": 4340 }, { "entropy": 8.913597106933594, "epoch": 0.42910816689736997, "mean_token_accuracy": 0.6677148938179016, "num_tokens": 23855910.0, "step": 4340, "train/ce_loss": 0.8679313063621521 }, { "epoch": 0.42910816689736997, "step": 4340, "train/sim_loss": 0.0390625 }, { "epoch": 0.42910816689736997, "step": 4340, "train/total_loss": 0.12585562467575073 }, { "entropy": 8.79263687133789, "epoch": 0.4292070397468855, "mean_token_accuracy": 0.746012270450592, "num_tokens": 23861404.0, "step": 4341, "train/ce_loss": 0.9954127073287964 }, { "epoch": 0.4292070397468855, "step": 4341, "train/sim_loss": 0.08203125 }, { "epoch": 0.4292070397468855, "step": 4341, "train/total_loss": 0.18157252669334412 }, { "entropy": 9.217334747314453, "epoch": 0.42930591259640105, "mean_token_accuracy": 0.70703125, "num_tokens": 23866814.0, "step": 4342, "train/ce_loss": 0.9141753315925598 }, { "epoch": 0.42930591259640105, "step": 4342, "train/sim_loss": 0.05078125 }, { "epoch": 0.42930591259640105, "step": 4342, "train/total_loss": 0.14219878613948822 }, { "entropy": 8.95736026763916, "epoch": 0.42940478544591654, "mean_token_accuracy": 0.7595238089561462, "num_tokens": 23872287.0, "step": 4343, "train/ce_loss": 0.8968206644058228 }, { "epoch": 0.42940478544591654, "step": 4343, "train/sim_loss": 0.0546875 }, { "epoch": 0.42940478544591654, "step": 4343, "train/total_loss": 0.14436957240104675 }, { "entropy": 8.712032318115234, "epoch": 0.4295036582954321, "mean_token_accuracy": 0.7584269642829895, "num_tokens": 23877770.0, "step": 4344, "train/ce_loss": 0.9462868571281433 }, { "epoch": 0.4295036582954321, "step": 4344, "train/sim_loss": 0.06640625 }, { "epoch": 0.4295036582954321, "step": 4344, "train/total_loss": 0.1610349416732788 }, { "entropy": 8.981897354125977, "epoch": 0.4296025311449476, "mean_token_accuracy": 0.7137637138366699, "num_tokens": 23883202.0, "step": 4345, "train/ce_loss": 0.7230494022369385 }, { "epoch": 0.4296025311449476, "step": 4345, "train/sim_loss": 0.1015625 }, { "epoch": 0.4296025311449476, "step": 4345, "train/total_loss": 0.17386743426322937 }, { "entropy": 9.13937759399414, "epoch": 0.4297014039944631, "mean_token_accuracy": 0.7059553265571594, "num_tokens": 23888656.0, "step": 4346, "train/ce_loss": 0.9597265124320984 }, { "epoch": 0.4297014039944631, "step": 4346, "train/sim_loss": 0.0625 }, { "epoch": 0.4297014039944631, "step": 4346, "train/total_loss": 0.15847265720367432 }, { "entropy": 9.259757995605469, "epoch": 0.42980027684397865, "mean_token_accuracy": 0.7066493034362793, "num_tokens": 23894056.0, "step": 4347, "train/ce_loss": 0.6120913624763489 }, { "epoch": 0.42980027684397865, "step": 4347, "train/sim_loss": 0.04296875 }, { "epoch": 0.42980027684397865, "step": 4347, "train/total_loss": 0.10417789220809937 }, { "entropy": 8.838827133178711, "epoch": 0.4298991496934942, "mean_token_accuracy": 0.793684184551239, "num_tokens": 23899659.0, "step": 4348, "train/ce_loss": 0.8979378938674927 }, { "epoch": 0.4298991496934942, "step": 4348, "train/sim_loss": 0.0625 }, { "epoch": 0.4298991496934942, "step": 4348, "train/total_loss": 0.15229380130767822 }, { "entropy": 9.17361831665039, "epoch": 0.4299980225430097, "mean_token_accuracy": 0.743107795715332, "num_tokens": 23905007.0, "step": 4349, "train/ce_loss": 0.9258038401603699 }, { "epoch": 0.4299980225430097, "step": 4349, "train/sim_loss": 0.08984375 }, { "epoch": 0.4299980225430097, "step": 4349, "train/total_loss": 0.1824241280555725 }, { "entropy": 9.239164352416992, "epoch": 0.4300968953925252, "mean_token_accuracy": 0.7047872543334961, "num_tokens": 23910404.0, "step": 4350, "train/ce_loss": 0.6834805011749268 }, { "epoch": 0.4300968953925252, "step": 4350, "train/sim_loss": 0.05078125 }, { "epoch": 0.4300968953925252, "step": 4350, "train/total_loss": 0.11912930011749268 }, { "entropy": 8.889925956726074, "epoch": 0.43019576824204075, "mean_token_accuracy": 0.7794928550720215, "num_tokens": 23915892.0, "step": 4351, "train/ce_loss": 0.4266245365142822 }, { "epoch": 0.43019576824204075, "step": 4351, "train/sim_loss": 0.015625 }, { "epoch": 0.43019576824204075, "step": 4351, "train/total_loss": 0.05828745290637016 }, { "entropy": 8.95865535736084, "epoch": 0.43029464109155624, "mean_token_accuracy": 0.7715458273887634, "num_tokens": 23921245.0, "step": 4352, "train/ce_loss": 0.45433947443962097 }, { "epoch": 0.43029464109155624, "step": 4352, "train/sim_loss": 0.08984375 }, { "epoch": 0.43029464109155624, "step": 4352, "train/total_loss": 0.13527770340442657 }, { "entropy": 8.608983993530273, "epoch": 0.4303935139410718, "mean_token_accuracy": 0.7964015007019043, "num_tokens": 23926826.0, "step": 4353, "train/ce_loss": 0.7558434009552002 }, { "epoch": 0.4303935139410718, "step": 4353, "train/sim_loss": 0.05859375 }, { "epoch": 0.4303935139410718, "step": 4353, "train/total_loss": 0.13417810201644897 }, { "entropy": 9.117816925048828, "epoch": 0.4304923867905873, "mean_token_accuracy": 0.7804014086723328, "num_tokens": 23932195.0, "step": 4354, "train/ce_loss": 0.2255360335111618 }, { "epoch": 0.4304923867905873, "step": 4354, "train/sim_loss": 0.08203125 }, { "epoch": 0.4304923867905873, "step": 4354, "train/total_loss": 0.10458485782146454 }, { "entropy": 9.113489151000977, "epoch": 0.4305912596401028, "mean_token_accuracy": 0.7012500166893005, "num_tokens": 23937603.0, "step": 4355, "train/ce_loss": 1.3321702480316162 }, { "epoch": 0.4305912596401028, "step": 4355, "train/sim_loss": 0.0703125 }, { "epoch": 0.4305912596401028, "step": 4355, "train/total_loss": 0.20352952182292938 }, { "entropy": 8.420955657958984, "epoch": 0.43069013248961835, "mean_token_accuracy": 0.7344173192977905, "num_tokens": 23943420.0, "step": 4356, "train/ce_loss": 1.3575314283370972 }, { "epoch": 0.43069013248961835, "step": 4356, "train/sim_loss": 0.0625 }, { "epoch": 0.43069013248961835, "step": 4356, "train/total_loss": 0.19825313985347748 }, { "entropy": 9.089115142822266, "epoch": 0.4307890053391339, "mean_token_accuracy": 0.7591742873191833, "num_tokens": 23949028.0, "step": 4357, "train/ce_loss": 0.5489127039909363 }, { "epoch": 0.4307890053391339, "step": 4357, "train/sim_loss": 0.09375 }, { "epoch": 0.4307890053391339, "step": 4357, "train/total_loss": 0.14864127337932587 }, { "entropy": 8.851123809814453, "epoch": 0.4308878781886494, "mean_token_accuracy": 0.7793263792991638, "num_tokens": 23954473.0, "step": 4358, "train/ce_loss": 0.5746714472770691 }, { "epoch": 0.4308878781886494, "step": 4358, "train/sim_loss": 0.0546875 }, { "epoch": 0.4308878781886494, "step": 4358, "train/total_loss": 0.11215464770793915 }, { "entropy": 9.089125633239746, "epoch": 0.4309867510381649, "mean_token_accuracy": 0.7788888812065125, "num_tokens": 23960134.0, "step": 4359, "train/ce_loss": 0.7912288904190063 }, { "epoch": 0.4309867510381649, "step": 4359, "train/sim_loss": 0.125 }, { "epoch": 0.4309867510381649, "step": 4359, "train/total_loss": 0.2041229009628296 }, { "epoch": 0.43108562388768046, "grad_norm": 0.6629348397254944, "learning_rate": 8.924739158384019e-06, "loss": 0.1432, "step": 4360 }, { "entropy": 9.287641525268555, "epoch": 0.43108562388768046, "mean_token_accuracy": 0.7330247163772583, "num_tokens": 23965514.0, "step": 4360, "train/ce_loss": 0.925274670124054 }, { "epoch": 0.43108562388768046, "step": 4360, "train/sim_loss": 0.11328125 }, { "epoch": 0.43108562388768046, "step": 4360, "train/total_loss": 0.20580872893333435 }, { "entropy": 8.586284637451172, "epoch": 0.43118449673719594, "mean_token_accuracy": 0.7654822468757629, "num_tokens": 23971091.0, "step": 4361, "train/ce_loss": 0.44881346821784973 }, { "epoch": 0.43118449673719594, "step": 4361, "train/sim_loss": 0.078125 }, { "epoch": 0.43118449673719594, "step": 4361, "train/total_loss": 0.12300634384155273 }, { "entropy": 8.729338645935059, "epoch": 0.4312833695867115, "mean_token_accuracy": 0.771458089351654, "num_tokens": 23976645.0, "step": 4362, "train/ce_loss": 0.9649606347084045 }, { "epoch": 0.4312833695867115, "step": 4362, "train/sim_loss": 0.08203125 }, { "epoch": 0.4312833695867115, "step": 4362, "train/total_loss": 0.1785273253917694 }, { "entropy": 8.863934516906738, "epoch": 0.431382242436227, "mean_token_accuracy": 0.7798165082931519, "num_tokens": 23982129.0, "step": 4363, "train/ce_loss": 0.6917652487754822 }, { "epoch": 0.431382242436227, "step": 4363, "train/sim_loss": 0.0390625 }, { "epoch": 0.431382242436227, "step": 4363, "train/total_loss": 0.10823902487754822 }, { "entropy": 8.95468807220459, "epoch": 0.4314811152857425, "mean_token_accuracy": 0.706606924533844, "num_tokens": 23987624.0, "step": 4364, "train/ce_loss": 0.5103545188903809 }, { "epoch": 0.4314811152857425, "step": 4364, "train/sim_loss": 0.0234375 }, { "epoch": 0.4314811152857425, "step": 4364, "train/total_loss": 0.07447294890880585 }, { "entropy": 8.819828987121582, "epoch": 0.43157998813525805, "mean_token_accuracy": 0.727918803691864, "num_tokens": 23993197.0, "step": 4365, "train/ce_loss": 1.6228585243225098 }, { "epoch": 0.43157998813525805, "step": 4365, "train/sim_loss": 0.1484375 }, { "epoch": 0.43157998813525805, "step": 4365, "train/total_loss": 0.31072336435317993 }, { "entropy": 9.073395729064941, "epoch": 0.4316788609847736, "mean_token_accuracy": 0.7098445892333984, "num_tokens": 23998670.0, "step": 4366, "train/ce_loss": 1.2265137434005737 }, { "epoch": 0.4316788609847736, "step": 4366, "train/sim_loss": 0.15234375 }, { "epoch": 0.4316788609847736, "step": 4366, "train/total_loss": 0.2749951183795929 }, { "entropy": 9.20622444152832, "epoch": 0.4317777338342891, "mean_token_accuracy": 0.7317743897438049, "num_tokens": 24004009.0, "step": 4367, "train/ce_loss": 1.0135321617126465 }, { "epoch": 0.4317777338342891, "step": 4367, "train/sim_loss": 0.078125 }, { "epoch": 0.4317777338342891, "step": 4367, "train/total_loss": 0.1794782280921936 }, { "entropy": 9.037958145141602, "epoch": 0.4318766066838046, "mean_token_accuracy": 0.7271589636802673, "num_tokens": 24009456.0, "step": 4368, "train/ce_loss": 0.7288763523101807 }, { "epoch": 0.4318766066838046, "step": 4368, "train/sim_loss": 0.03515625 }, { "epoch": 0.4318766066838046, "step": 4368, "train/total_loss": 0.10804388672113419 }, { "entropy": 8.75796127319336, "epoch": 0.43197547953332016, "mean_token_accuracy": 0.7857142686843872, "num_tokens": 24014966.0, "step": 4369, "train/ce_loss": 0.6646449565887451 }, { "epoch": 0.43197547953332016, "step": 4369, "train/sim_loss": 0.04296875 }, { "epoch": 0.43197547953332016, "step": 4369, "train/total_loss": 0.10943324863910675 }, { "entropy": 8.867822647094727, "epoch": 0.43207435238283565, "mean_token_accuracy": 0.7532861232757568, "num_tokens": 24020601.0, "step": 4370, "train/ce_loss": 0.6802073121070862 }, { "epoch": 0.43207435238283565, "step": 4370, "train/sim_loss": 0.01953125 }, { "epoch": 0.43207435238283565, "step": 4370, "train/total_loss": 0.08755198121070862 }, { "entropy": 9.096464157104492, "epoch": 0.4321732252323512, "mean_token_accuracy": 0.7241379022598267, "num_tokens": 24026073.0, "step": 4371, "train/ce_loss": 0.9358828067779541 }, { "epoch": 0.4321732252323512, "step": 4371, "train/sim_loss": 0.05859375 }, { "epoch": 0.4321732252323512, "step": 4371, "train/total_loss": 0.15218204259872437 }, { "entropy": 8.621452331542969, "epoch": 0.43227209808186673, "mean_token_accuracy": 0.7831558585166931, "num_tokens": 24031729.0, "step": 4372, "train/ce_loss": 0.5406470894813538 }, { "epoch": 0.43227209808186673, "step": 4372, "train/sim_loss": 0.0234375 }, { "epoch": 0.43227209808186673, "step": 4372, "train/total_loss": 0.07750220596790314 }, { "entropy": 9.161718368530273, "epoch": 0.4323709709313822, "mean_token_accuracy": 0.75789475440979, "num_tokens": 24037112.0, "step": 4373, "train/ce_loss": 0.8027073740959167 }, { "epoch": 0.4323709709313822, "step": 4373, "train/sim_loss": 0.078125 }, { "epoch": 0.4323709709313822, "step": 4373, "train/total_loss": 0.15839573740959167 }, { "entropy": 8.834321975708008, "epoch": 0.43246984378089776, "mean_token_accuracy": 0.7979999780654907, "num_tokens": 24042704.0, "step": 4374, "train/ce_loss": 0.42851537466049194 }, { "epoch": 0.43246984378089776, "step": 4374, "train/sim_loss": 0.0234375 }, { "epoch": 0.43246984378089776, "step": 4374, "train/total_loss": 0.0662890374660492 }, { "entropy": 8.858553886413574, "epoch": 0.4325687166304133, "mean_token_accuracy": 0.7425025701522827, "num_tokens": 24048235.0, "step": 4375, "train/ce_loss": 1.0219181776046753 }, { "epoch": 0.4325687166304133, "step": 4375, "train/sim_loss": 0.05859375 }, { "epoch": 0.4325687166304133, "step": 4375, "train/total_loss": 0.16078557074069977 }, { "entropy": 8.950654029846191, "epoch": 0.43266758947992884, "mean_token_accuracy": 0.7595269680023193, "num_tokens": 24053658.0, "step": 4376, "train/ce_loss": 0.8719038963317871 }, { "epoch": 0.43266758947992884, "step": 4376, "train/sim_loss": 0.05078125 }, { "epoch": 0.43266758947992884, "step": 4376, "train/total_loss": 0.1379716396331787 }, { "entropy": 9.074840545654297, "epoch": 0.4327664623294443, "mean_token_accuracy": 0.7555012106895447, "num_tokens": 24059070.0, "step": 4377, "train/ce_loss": 0.8076412677764893 }, { "epoch": 0.4327664623294443, "step": 4377, "train/sim_loss": 0.03125 }, { "epoch": 0.4327664623294443, "step": 4377, "train/total_loss": 0.11201412975788116 }, { "entropy": 9.022673606872559, "epoch": 0.43286533517895986, "mean_token_accuracy": 0.7737818956375122, "num_tokens": 24064560.0, "step": 4378, "train/ce_loss": 0.8465703129768372 }, { "epoch": 0.43286533517895986, "step": 4378, "train/sim_loss": 0.015625 }, { "epoch": 0.43286533517895986, "step": 4378, "train/total_loss": 0.10028203576803207 }, { "entropy": 9.42231559753418, "epoch": 0.4329642080284754, "mean_token_accuracy": 0.7522255182266235, "num_tokens": 24069876.0, "step": 4379, "train/ce_loss": 0.581962525844574 }, { "epoch": 0.4329642080284754, "step": 4379, "train/sim_loss": 0.03125 }, { "epoch": 0.4329642080284754, "step": 4379, "train/total_loss": 0.08944625407457352 }, { "epoch": 0.4330630808779909, "grad_norm": 0.8169946074485779, "learning_rate": 8.91979429362607e-06, "loss": 0.1343, "step": 4380 }, { "entropy": 9.012482643127441, "epoch": 0.4330630808779909, "mean_token_accuracy": 0.800000011920929, "num_tokens": 24075329.0, "step": 4380, "train/ce_loss": 0.37300965189933777 }, { "epoch": 0.4330630808779909, "step": 4380, "train/sim_loss": 0.0234375 }, { "epoch": 0.4330630808779909, "step": 4380, "train/total_loss": 0.060738466680049896 }, { "entropy": 8.887073516845703, "epoch": 0.43316195372750643, "mean_token_accuracy": 0.7603938579559326, "num_tokens": 24080858.0, "step": 4381, "train/ce_loss": 0.8467317223548889 }, { "epoch": 0.43316195372750643, "step": 4381, "train/sim_loss": 0.015625 }, { "epoch": 0.43316195372750643, "step": 4381, "train/total_loss": 0.10029817372560501 }, { "entropy": 9.30130672454834, "epoch": 0.433260826577022, "mean_token_accuracy": 0.7568681240081787, "num_tokens": 24086189.0, "step": 4382, "train/ce_loss": 1.047757625579834 }, { "epoch": 0.433260826577022, "step": 4382, "train/sim_loss": 0.0390625 }, { "epoch": 0.433260826577022, "step": 4382, "train/total_loss": 0.14383825659751892 }, { "entropy": 8.977653503417969, "epoch": 0.43335969942653746, "mean_token_accuracy": 0.7460317611694336, "num_tokens": 24091684.0, "step": 4383, "train/ce_loss": 1.0849130153656006 }, { "epoch": 0.43335969942653746, "step": 4383, "train/sim_loss": 0.03125 }, { "epoch": 0.43335969942653746, "step": 4383, "train/total_loss": 0.13974130153656006 }, { "entropy": 8.855316162109375, "epoch": 0.433458572276053, "mean_token_accuracy": 0.7953540086746216, "num_tokens": 24097261.0, "step": 4384, "train/ce_loss": 0.9016016721725464 }, { "epoch": 0.433458572276053, "step": 4384, "train/sim_loss": 0.015625 }, { "epoch": 0.433458572276053, "step": 4384, "train/total_loss": 0.10578516870737076 }, { "entropy": 9.202747344970703, "epoch": 0.43355744512556854, "mean_token_accuracy": 0.7560663819313049, "num_tokens": 24102803.0, "step": 4385, "train/ce_loss": 0.596872866153717 }, { "epoch": 0.43355744512556854, "step": 4385, "train/sim_loss": 0.0234375 }, { "epoch": 0.43355744512556854, "step": 4385, "train/total_loss": 0.0831247866153717 }, { "entropy": 9.080239295959473, "epoch": 0.433656317975084, "mean_token_accuracy": 0.7268232107162476, "num_tokens": 24108163.0, "step": 4386, "train/ce_loss": 0.713167667388916 }, { "epoch": 0.433656317975084, "step": 4386, "train/sim_loss": 0.0234375 }, { "epoch": 0.433656317975084, "step": 4386, "train/total_loss": 0.09475427120923996 }, { "entropy": 9.060267448425293, "epoch": 0.43375519082459957, "mean_token_accuracy": 0.7776618003845215, "num_tokens": 24113684.0, "step": 4387, "train/ce_loss": 0.53919917345047 }, { "epoch": 0.43375519082459957, "step": 4387, "train/sim_loss": 0.015625 }, { "epoch": 0.43375519082459957, "step": 4387, "train/total_loss": 0.06954491883516312 }, { "entropy": 9.085131645202637, "epoch": 0.4338540636741151, "mean_token_accuracy": 0.75, "num_tokens": 24119296.0, "step": 4388, "train/ce_loss": 0.45214515924453735 }, { "epoch": 0.4338540636741151, "step": 4388, "train/sim_loss": 0.05078125 }, { "epoch": 0.4338540636741151, "step": 4388, "train/total_loss": 0.09599576890468597 }, { "entropy": 8.889970779418945, "epoch": 0.4339529365236306, "mean_token_accuracy": 0.7626699805259705, "num_tokens": 24124688.0, "step": 4389, "train/ce_loss": 0.7740401029586792 }, { "epoch": 0.4339529365236306, "step": 4389, "train/sim_loss": 0.06640625 }, { "epoch": 0.4339529365236306, "step": 4389, "train/total_loss": 0.14381027221679688 }, { "entropy": 9.045398712158203, "epoch": 0.43405180937314614, "mean_token_accuracy": 0.7737143039703369, "num_tokens": 24130224.0, "step": 4390, "train/ce_loss": 0.8042582273483276 }, { "epoch": 0.43405180937314614, "step": 4390, "train/sim_loss": 0.03125 }, { "epoch": 0.43405180937314614, "step": 4390, "train/total_loss": 0.11167582124471664 }, { "entropy": 9.377883911132812, "epoch": 0.4341506822226617, "mean_token_accuracy": 0.7564979195594788, "num_tokens": 24135537.0, "step": 4391, "train/ce_loss": 0.9751831293106079 }, { "epoch": 0.4341506822226617, "step": 4391, "train/sim_loss": 0.0625 }, { "epoch": 0.4341506822226617, "step": 4391, "train/total_loss": 0.16001832485198975 }, { "entropy": 9.322731018066406, "epoch": 0.43424955507217716, "mean_token_accuracy": 0.7481590509414673, "num_tokens": 24140847.0, "step": 4392, "train/ce_loss": 0.5957899689674377 }, { "epoch": 0.43424955507217716, "step": 4392, "train/sim_loss": 0.0390625 }, { "epoch": 0.43424955507217716, "step": 4392, "train/total_loss": 0.09864149987697601 }, { "entropy": 8.948532104492188, "epoch": 0.4343484279216927, "mean_token_accuracy": 0.7505176067352295, "num_tokens": 24146431.0, "step": 4393, "train/ce_loss": 0.7130733728408813 }, { "epoch": 0.4343484279216927, "step": 4393, "train/sim_loss": 0.04296875 }, { "epoch": 0.4343484279216927, "step": 4393, "train/total_loss": 0.11427608877420425 }, { "entropy": 8.916778564453125, "epoch": 0.43444730077120824, "mean_token_accuracy": 0.760869562625885, "num_tokens": 24152018.0, "step": 4394, "train/ce_loss": 0.7320356369018555 }, { "epoch": 0.43444730077120824, "step": 4394, "train/sim_loss": 0.07421875 }, { "epoch": 0.43444730077120824, "step": 4394, "train/total_loss": 0.14742231369018555 }, { "entropy": 9.059212684631348, "epoch": 0.43454617362072373, "mean_token_accuracy": 0.684596598148346, "num_tokens": 24157391.0, "step": 4395, "train/ce_loss": 2.2738263607025146 }, { "epoch": 0.43454617362072373, "step": 4395, "train/sim_loss": 0.0703125 }, { "epoch": 0.43454617362072373, "step": 4395, "train/total_loss": 0.2976951599121094 }, { "entropy": 9.000699996948242, "epoch": 0.43464504647023927, "mean_token_accuracy": 0.7173219919204712, "num_tokens": 24162966.0, "step": 4396, "train/ce_loss": 1.1289267539978027 }, { "epoch": 0.43464504647023927, "step": 4396, "train/sim_loss": 0.125 }, { "epoch": 0.43464504647023927, "step": 4396, "train/total_loss": 0.23789268732070923 }, { "entropy": 9.441609382629395, "epoch": 0.4347439193197548, "mean_token_accuracy": 0.7805255055427551, "num_tokens": 24168249.0, "step": 4397, "train/ce_loss": 0.4067108631134033 }, { "epoch": 0.4347439193197548, "step": 4397, "train/sim_loss": 0.01953125 }, { "epoch": 0.4347439193197548, "step": 4397, "train/total_loss": 0.06020233780145645 }, { "entropy": 8.959494590759277, "epoch": 0.4348427921692703, "mean_token_accuracy": 0.7541766166687012, "num_tokens": 24173722.0, "step": 4398, "train/ce_loss": 1.038891315460205 }, { "epoch": 0.4348427921692703, "step": 4398, "train/sim_loss": 0.03125 }, { "epoch": 0.4348427921692703, "step": 4398, "train/total_loss": 0.13513913750648499 }, { "entropy": 9.027135848999023, "epoch": 0.43494166501878584, "mean_token_accuracy": 0.8118932247161865, "num_tokens": 24179321.0, "step": 4399, "train/ce_loss": 0.7304800152778625 }, { "epoch": 0.43494166501878584, "step": 4399, "train/sim_loss": 0.08203125 }, { "epoch": 0.43494166501878584, "step": 4399, "train/total_loss": 0.15507924556732178 }, { "epoch": 0.4350405378683014, "grad_norm": 0.6293058395385742, "learning_rate": 8.914849428868121e-06, "loss": 0.1343, "step": 4400 }, { "entropy": 9.03187084197998, "epoch": 0.4350405378683014, "mean_token_accuracy": 0.7318181991577148, "num_tokens": 24184747.0, "step": 4400, "train/ce_loss": 0.6305906176567078 }, { "epoch": 0.4350405378683014, "step": 4400, "train/sim_loss": 0.0390625 }, { "epoch": 0.4350405378683014, "step": 4400, "train/total_loss": 0.10212156176567078 }, { "entropy": 8.360186576843262, "epoch": 0.43513941071781687, "mean_token_accuracy": 0.7478991746902466, "num_tokens": 24190491.0, "step": 4401, "train/ce_loss": 0.991296648979187 }, { "epoch": 0.43513941071781687, "step": 4401, "train/sim_loss": 0.0625 }, { "epoch": 0.43513941071781687, "step": 4401, "train/total_loss": 0.16162967681884766 }, { "entropy": 9.112241744995117, "epoch": 0.4352382835673324, "mean_token_accuracy": 0.7010078430175781, "num_tokens": 24195997.0, "step": 4402, "train/ce_loss": 1.6751786470413208 }, { "epoch": 0.4352382835673324, "step": 4402, "train/sim_loss": 0.08984375 }, { "epoch": 0.4352382835673324, "step": 4402, "train/total_loss": 0.25736162066459656 }, { "entropy": 9.317626953125, "epoch": 0.43533715641684795, "mean_token_accuracy": 0.7831858396530151, "num_tokens": 24201264.0, "step": 4403, "train/ce_loss": 0.607154905796051 }, { "epoch": 0.43533715641684795, "step": 4403, "train/sim_loss": 0.046875 }, { "epoch": 0.43533715641684795, "step": 4403, "train/total_loss": 0.10759049654006958 }, { "entropy": 9.147956848144531, "epoch": 0.43543602926636343, "mean_token_accuracy": 0.7312348484992981, "num_tokens": 24206661.0, "step": 4404, "train/ce_loss": 1.042081356048584 }, { "epoch": 0.43543602926636343, "step": 4404, "train/sim_loss": 0.03515625 }, { "epoch": 0.43543602926636343, "step": 4404, "train/total_loss": 0.13936439156532288 }, { "entropy": 8.7960844039917, "epoch": 0.435534902115879, "mean_token_accuracy": 0.7344086170196533, "num_tokens": 24212351.0, "step": 4405, "train/ce_loss": 1.2465267181396484 }, { "epoch": 0.435534902115879, "step": 4405, "train/sim_loss": 0.03515625 }, { "epoch": 0.435534902115879, "step": 4405, "train/total_loss": 0.1598089337348938 }, { "entropy": 8.830793380737305, "epoch": 0.4356337749653945, "mean_token_accuracy": 0.7639485001564026, "num_tokens": 24217925.0, "step": 4406, "train/ce_loss": 0.4985183775424957 }, { "epoch": 0.4356337749653945, "step": 4406, "train/sim_loss": 0.0625 }, { "epoch": 0.4356337749653945, "step": 4406, "train/total_loss": 0.11235183477401733 }, { "entropy": 8.838595390319824, "epoch": 0.43573264781491, "mean_token_accuracy": 0.759036123752594, "num_tokens": 24223482.0, "step": 4407, "train/ce_loss": 0.5044899582862854 }, { "epoch": 0.43573264781491, "step": 4407, "train/sim_loss": 0.03125 }, { "epoch": 0.43573264781491, "step": 4407, "train/total_loss": 0.08169899880886078 }, { "entropy": 9.292428970336914, "epoch": 0.43583152066442554, "mean_token_accuracy": 0.7138964533805847, "num_tokens": 24228807.0, "step": 4408, "train/ce_loss": 1.3425885438919067 }, { "epoch": 0.43583152066442554, "step": 4408, "train/sim_loss": 0.08984375 }, { "epoch": 0.43583152066442554, "step": 4408, "train/total_loss": 0.22410260140895844 }, { "entropy": 9.341452598571777, "epoch": 0.4359303935139411, "mean_token_accuracy": 0.7607449889183044, "num_tokens": 24234178.0, "step": 4409, "train/ce_loss": 0.788894534111023 }, { "epoch": 0.4359303935139411, "step": 4409, "train/sim_loss": 0.07421875 }, { "epoch": 0.4359303935139411, "step": 4409, "train/total_loss": 0.15310820937156677 }, { "entropy": 8.999114990234375, "epoch": 0.43602926636345657, "mean_token_accuracy": 0.8039215803146362, "num_tokens": 24239663.0, "step": 4410, "train/ce_loss": 0.4156546890735626 }, { "epoch": 0.43602926636345657, "step": 4410, "train/sim_loss": 0.04296875 }, { "epoch": 0.43602926636345657, "step": 4410, "train/total_loss": 0.08453422039747238 }, { "entropy": 9.232725143432617, "epoch": 0.4361281392129721, "mean_token_accuracy": 0.7879161238670349, "num_tokens": 24245065.0, "step": 4411, "train/ce_loss": 0.5786951184272766 }, { "epoch": 0.4361281392129721, "step": 4411, "train/sim_loss": 0.05078125 }, { "epoch": 0.4361281392129721, "step": 4411, "train/total_loss": 0.10865075886249542 }, { "entropy": 8.989053726196289, "epoch": 0.43622701206248765, "mean_token_accuracy": 0.7195402383804321, "num_tokens": 24250529.0, "step": 4412, "train/ce_loss": 1.3834420442581177 }, { "epoch": 0.43622701206248765, "step": 4412, "train/sim_loss": 0.0390625 }, { "epoch": 0.43622701206248765, "step": 4412, "train/total_loss": 0.17740671336650848 }, { "entropy": 8.755338668823242, "epoch": 0.43632588491200314, "mean_token_accuracy": 0.7378516793251038, "num_tokens": 24255942.0, "step": 4413, "train/ce_loss": 0.6935248374938965 }, { "epoch": 0.43632588491200314, "step": 4413, "train/sim_loss": 0.0703125 }, { "epoch": 0.43632588491200314, "step": 4413, "train/total_loss": 0.13966497778892517 }, { "entropy": 9.087223052978516, "epoch": 0.4364247577615187, "mean_token_accuracy": 0.7044887542724609, "num_tokens": 24261363.0, "step": 4414, "train/ce_loss": 0.8355671763420105 }, { "epoch": 0.4364247577615187, "step": 4414, "train/sim_loss": 0.015625 }, { "epoch": 0.4364247577615187, "step": 4414, "train/total_loss": 0.09918171912431717 }, { "entropy": 9.004988670349121, "epoch": 0.4365236306110342, "mean_token_accuracy": 0.7115384340286255, "num_tokens": 24266732.0, "step": 4415, "train/ce_loss": 0.9240627884864807 }, { "epoch": 0.4365236306110342, "step": 4415, "train/sim_loss": 0.0546875 }, { "epoch": 0.4365236306110342, "step": 4415, "train/total_loss": 0.1470937728881836 }, { "entropy": 8.892204284667969, "epoch": 0.43662250346054976, "mean_token_accuracy": 0.7371007204055786, "num_tokens": 24272241.0, "step": 4416, "train/ce_loss": 0.9073285460472107 }, { "epoch": 0.43662250346054976, "step": 4416, "train/sim_loss": 0.04296875 }, { "epoch": 0.43662250346054976, "step": 4416, "train/total_loss": 0.1337016075849533 }, { "entropy": 8.869039535522461, "epoch": 0.43672137631006525, "mean_token_accuracy": 0.742821455001831, "num_tokens": 24277749.0, "step": 4417, "train/ce_loss": 1.0588479042053223 }, { "epoch": 0.43672137631006525, "step": 4417, "train/sim_loss": 0.109375 }, { "epoch": 0.43672137631006525, "step": 4417, "train/total_loss": 0.21525979042053223 }, { "entropy": 9.135133743286133, "epoch": 0.4368202491595808, "mean_token_accuracy": 0.6955445408821106, "num_tokens": 24283181.0, "step": 4418, "train/ce_loss": 0.9827292561531067 }, { "epoch": 0.4368202491595808, "step": 4418, "train/sim_loss": 0.0859375 }, { "epoch": 0.4368202491595808, "step": 4418, "train/total_loss": 0.1842104196548462 }, { "entropy": 9.215568542480469, "epoch": 0.43691912200909633, "mean_token_accuracy": 0.7354570627212524, "num_tokens": 24288490.0, "step": 4419, "train/ce_loss": 0.5740864276885986 }, { "epoch": 0.43691912200909633, "step": 4419, "train/sim_loss": 0.0546875 }, { "epoch": 0.43691912200909633, "step": 4419, "train/total_loss": 0.1120961457490921 }, { "epoch": 0.4370179948586118, "grad_norm": 0.7417439818382263, "learning_rate": 8.909904564110172e-06, "loss": 0.1437, "step": 4420 }, { "entropy": 9.019072532653809, "epoch": 0.4370179948586118, "mean_token_accuracy": 0.7257683277130127, "num_tokens": 24293949.0, "step": 4420, "train/ce_loss": 1.1143687963485718 }, { "epoch": 0.4370179948586118, "step": 4420, "train/sim_loss": 0.05859375 }, { "epoch": 0.4370179948586118, "step": 4420, "train/total_loss": 0.1700306236743927 }, { "entropy": 8.755172729492188, "epoch": 0.43711686770812735, "mean_token_accuracy": 0.7303482294082642, "num_tokens": 24299464.0, "step": 4421, "train/ce_loss": 0.48290976881980896 }, { "epoch": 0.43711686770812735, "step": 4421, "train/sim_loss": 0.0703125 }, { "epoch": 0.43711686770812735, "step": 4421, "train/total_loss": 0.11860348284244537 }, { "entropy": 9.05835247039795, "epoch": 0.4372157405576429, "mean_token_accuracy": 0.7339218258857727, "num_tokens": 24304773.0, "step": 4422, "train/ce_loss": 1.5115057229995728 }, { "epoch": 0.4372157405576429, "step": 4422, "train/sim_loss": 0.0859375 }, { "epoch": 0.4372157405576429, "step": 4422, "train/total_loss": 0.23708806931972504 }, { "entropy": 9.09469985961914, "epoch": 0.4373146134071584, "mean_token_accuracy": 0.7680473327636719, "num_tokens": 24310201.0, "step": 4423, "train/ce_loss": 0.9290922284126282 }, { "epoch": 0.4373146134071584, "step": 4423, "train/sim_loss": 0.08984375 }, { "epoch": 0.4373146134071584, "step": 4423, "train/total_loss": 0.18275296688079834 }, { "entropy": 8.965282440185547, "epoch": 0.4374134862566739, "mean_token_accuracy": 0.7617371082305908, "num_tokens": 24315715.0, "step": 4424, "train/ce_loss": 0.6252190470695496 }, { "epoch": 0.4374134862566739, "step": 4424, "train/sim_loss": 0.0625 }, { "epoch": 0.4374134862566739, "step": 4424, "train/total_loss": 0.12502190470695496 }, { "entropy": 8.677712440490723, "epoch": 0.43751235910618946, "mean_token_accuracy": 0.7193169593811035, "num_tokens": 24321259.0, "step": 4425, "train/ce_loss": 0.6982981562614441 }, { "epoch": 0.43751235910618946, "step": 4425, "train/sim_loss": 0.0546875 }, { "epoch": 0.43751235910618946, "step": 4425, "train/total_loss": 0.12451731413602829 }, { "entropy": 9.113158226013184, "epoch": 0.43761123195570495, "mean_token_accuracy": 0.727990984916687, "num_tokens": 24326768.0, "step": 4426, "train/ce_loss": 0.8153543472290039 }, { "epoch": 0.43761123195570495, "step": 4426, "train/sim_loss": 0.046875 }, { "epoch": 0.43761123195570495, "step": 4426, "train/total_loss": 0.1284104287624359 }, { "entropy": 8.983928680419922, "epoch": 0.4377101048052205, "mean_token_accuracy": 0.7664515972137451, "num_tokens": 24332239.0, "step": 4427, "train/ce_loss": 0.6154584884643555 }, { "epoch": 0.4377101048052205, "step": 4427, "train/sim_loss": 0.03125 }, { "epoch": 0.4377101048052205, "step": 4427, "train/total_loss": 0.09279584884643555 }, { "entropy": 8.71723461151123, "epoch": 0.43780897765473603, "mean_token_accuracy": 0.7037037014961243, "num_tokens": 24337825.0, "step": 4428, "train/ce_loss": 1.5019268989562988 }, { "epoch": 0.43780897765473603, "step": 4428, "train/sim_loss": 0.03125 }, { "epoch": 0.43780897765473603, "step": 4428, "train/total_loss": 0.18144269287586212 }, { "entropy": 8.733060836791992, "epoch": 0.4379078505042515, "mean_token_accuracy": 0.7765237092971802, "num_tokens": 24343307.0, "step": 4429, "train/ce_loss": 0.5003811120986938 }, { "epoch": 0.4379078505042515, "step": 4429, "train/sim_loss": 0.0859375 }, { "epoch": 0.4379078505042515, "step": 4429, "train/total_loss": 0.13597561419010162 }, { "entropy": 8.724028587341309, "epoch": 0.43800672335376706, "mean_token_accuracy": 0.7135325074195862, "num_tokens": 24348937.0, "step": 4430, "train/ce_loss": 1.8474000692367554 }, { "epoch": 0.43800672335376706, "step": 4430, "train/sim_loss": 0.04296875 }, { "epoch": 0.43800672335376706, "step": 4430, "train/total_loss": 0.22770875692367554 }, { "entropy": 9.01593017578125, "epoch": 0.4381055962032826, "mean_token_accuracy": 0.7108603715896606, "num_tokens": 24354294.0, "step": 4431, "train/ce_loss": 1.3813462257385254 }, { "epoch": 0.4381055962032826, "step": 4431, "train/sim_loss": 0.07421875 }, { "epoch": 0.4381055962032826, "step": 4431, "train/total_loss": 0.21235337853431702 }, { "entropy": 8.986350059509277, "epoch": 0.4382044690527981, "mean_token_accuracy": 0.712284505367279, "num_tokens": 24359832.0, "step": 4432, "train/ce_loss": 0.4970395267009735 }, { "epoch": 0.4382044690527981, "step": 4432, "train/sim_loss": 0.08203125 }, { "epoch": 0.4382044690527981, "step": 4432, "train/total_loss": 0.1317352056503296 }, { "entropy": 8.88647174835205, "epoch": 0.4383033419023136, "mean_token_accuracy": 0.734274685382843, "num_tokens": 24365255.0, "step": 4433, "train/ce_loss": 0.599248468875885 }, { "epoch": 0.4383033419023136, "step": 4433, "train/sim_loss": 0.08984375 }, { "epoch": 0.4383033419023136, "step": 4433, "train/total_loss": 0.14976859092712402 }, { "entropy": 8.999967575073242, "epoch": 0.43840221475182917, "mean_token_accuracy": 0.7843137383460999, "num_tokens": 24370791.0, "step": 4434, "train/ce_loss": 0.7487931251525879 }, { "epoch": 0.43840221475182917, "step": 4434, "train/sim_loss": 0.0546875 }, { "epoch": 0.43840221475182917, "step": 4434, "train/total_loss": 0.12956681847572327 }, { "entropy": 8.676895141601562, "epoch": 0.43850108760134465, "mean_token_accuracy": 0.7423664331436157, "num_tokens": 24376481.0, "step": 4435, "train/ce_loss": 0.9853678941726685 }, { "epoch": 0.43850108760134465, "step": 4435, "train/sim_loss": 0.05078125 }, { "epoch": 0.43850108760134465, "step": 4435, "train/total_loss": 0.14931803941726685 }, { "entropy": 8.479966163635254, "epoch": 0.4385999604508602, "mean_token_accuracy": 0.7068303823471069, "num_tokens": 24382395.0, "step": 4436, "train/ce_loss": 0.8324254751205444 }, { "epoch": 0.4385999604508602, "step": 4436, "train/sim_loss": 0.02734375 }, { "epoch": 0.4385999604508602, "step": 4436, "train/total_loss": 0.11058630049228668 }, { "entropy": 8.562737464904785, "epoch": 0.43869883330037573, "mean_token_accuracy": 0.7439786195755005, "num_tokens": 24388133.0, "step": 4437, "train/ce_loss": 0.9296632409095764 }, { "epoch": 0.43869883330037573, "step": 4437, "train/sim_loss": 0.078125 }, { "epoch": 0.43869883330037573, "step": 4437, "train/total_loss": 0.17109131813049316 }, { "entropy": 8.682954788208008, "epoch": 0.4387977061498912, "mean_token_accuracy": 0.7232597470283508, "num_tokens": 24393901.0, "step": 4438, "train/ce_loss": 0.5346986651420593 }, { "epoch": 0.4387977061498912, "step": 4438, "train/sim_loss": 0.08203125 }, { "epoch": 0.4387977061498912, "step": 4438, "train/total_loss": 0.13550111651420593 }, { "entropy": 9.221953392028809, "epoch": 0.43889657899940676, "mean_token_accuracy": 0.7690253853797913, "num_tokens": 24399234.0, "step": 4439, "train/ce_loss": 0.695620059967041 }, { "epoch": 0.43889657899940676, "step": 4439, "train/sim_loss": 0.078125 }, { "epoch": 0.43889657899940676, "step": 4439, "train/total_loss": 0.14768701791763306 }, { "epoch": 0.4389954518489223, "grad_norm": 0.7528851628303528, "learning_rate": 8.904959699352224e-06, "loss": 0.1421, "step": 4440 }, { "entropy": 8.984628677368164, "epoch": 0.4389954518489223, "mean_token_accuracy": 0.7580246925354004, "num_tokens": 24404608.0, "step": 4440, "train/ce_loss": 0.6328949332237244 }, { "epoch": 0.4389954518489223, "step": 4440, "train/sim_loss": 0.03515625 }, { "epoch": 0.4389954518489223, "step": 4440, "train/total_loss": 0.09844574332237244 }, { "entropy": 8.974782943725586, "epoch": 0.4390943246984378, "mean_token_accuracy": 0.648202121257782, "num_tokens": 24410198.0, "step": 4441, "train/ce_loss": 2.553335428237915 }, { "epoch": 0.4390943246984378, "step": 4441, "train/sim_loss": 0.06640625 }, { "epoch": 0.4390943246984378, "step": 4441, "train/total_loss": 0.3217397928237915 }, { "entropy": 9.00754165649414, "epoch": 0.43919319754795333, "mean_token_accuracy": 0.726881742477417, "num_tokens": 24415706.0, "step": 4442, "train/ce_loss": 1.4434539079666138 }, { "epoch": 0.43919319754795333, "step": 4442, "train/sim_loss": 0.0859375 }, { "epoch": 0.43919319754795333, "step": 4442, "train/total_loss": 0.23028288781642914 }, { "entropy": 8.674665451049805, "epoch": 0.43929207039746887, "mean_token_accuracy": 0.7505422830581665, "num_tokens": 24421260.0, "step": 4443, "train/ce_loss": 1.243742823600769 }, { "epoch": 0.43929207039746887, "step": 4443, "train/sim_loss": 0.06640625 }, { "epoch": 0.43929207039746887, "step": 4443, "train/total_loss": 0.19078053534030914 }, { "entropy": 9.327781677246094, "epoch": 0.43939094324698436, "mean_token_accuracy": 0.7450980544090271, "num_tokens": 24426545.0, "step": 4444, "train/ce_loss": 0.3182961344718933 }, { "epoch": 0.43939094324698436, "step": 4444, "train/sim_loss": 0.0625 }, { "epoch": 0.43939094324698436, "step": 4444, "train/total_loss": 0.09432961046695709 }, { "entropy": 8.524352073669434, "epoch": 0.4394898160964999, "mean_token_accuracy": 0.6870876550674438, "num_tokens": 24432198.0, "step": 4445, "train/ce_loss": 0.651814341545105 }, { "epoch": 0.4394898160964999, "step": 4445, "train/sim_loss": 0.08984375 }, { "epoch": 0.4394898160964999, "step": 4445, "train/total_loss": 0.1550251841545105 }, { "entropy": 8.696941375732422, "epoch": 0.43958868894601544, "mean_token_accuracy": 0.6822595000267029, "num_tokens": 24437947.0, "step": 4446, "train/ce_loss": 0.4244893491268158 }, { "epoch": 0.43958868894601544, "step": 4446, "train/sim_loss": 0.05078125 }, { "epoch": 0.43958868894601544, "step": 4446, "train/total_loss": 0.09323018789291382 }, { "entropy": 8.901978492736816, "epoch": 0.4396875617955309, "mean_token_accuracy": 0.7525380849838257, "num_tokens": 24443330.0, "step": 4447, "train/ce_loss": 0.7427362203598022 }, { "epoch": 0.4396875617955309, "step": 4447, "train/sim_loss": 0.0703125 }, { "epoch": 0.4396875617955309, "step": 4447, "train/total_loss": 0.14458611607551575 }, { "entropy": 8.864349365234375, "epoch": 0.43978643464504646, "mean_token_accuracy": 0.7530712485313416, "num_tokens": 24448718.0, "step": 4448, "train/ce_loss": 1.1789982318878174 }, { "epoch": 0.43978643464504646, "step": 4448, "train/sim_loss": 0.046875 }, { "epoch": 0.43978643464504646, "step": 4448, "train/total_loss": 0.1647748351097107 }, { "entropy": 8.888208389282227, "epoch": 0.439885307494562, "mean_token_accuracy": 0.7573839426040649, "num_tokens": 24454216.0, "step": 4449, "train/ce_loss": 1.0476337671279907 }, { "epoch": 0.439885307494562, "step": 4449, "train/sim_loss": 0.05859375 }, { "epoch": 0.439885307494562, "step": 4449, "train/total_loss": 0.16335713863372803 }, { "entropy": 9.04527473449707, "epoch": 0.4399841803440775, "mean_token_accuracy": 0.7639680504798889, "num_tokens": 24459708.0, "step": 4450, "train/ce_loss": 0.6891525983810425 }, { "epoch": 0.4399841803440775, "step": 4450, "train/sim_loss": 0.078125 }, { "epoch": 0.4399841803440775, "step": 4450, "train/total_loss": 0.1470402628183365 }, { "entropy": 8.744426727294922, "epoch": 0.44008305319359303, "mean_token_accuracy": 0.7828220725059509, "num_tokens": 24465228.0, "step": 4451, "train/ce_loss": 1.210342526435852 }, { "epoch": 0.44008305319359303, "step": 4451, "train/sim_loss": 0.09375 }, { "epoch": 0.44008305319359303, "step": 4451, "train/total_loss": 0.21478426456451416 }, { "entropy": 9.000271797180176, "epoch": 0.4401819260431086, "mean_token_accuracy": 0.7372972965240479, "num_tokens": 24470661.0, "step": 4452, "train/ce_loss": 0.7545905113220215 }, { "epoch": 0.4401819260431086, "step": 4452, "train/sim_loss": 0.0546875 }, { "epoch": 0.4401819260431086, "step": 4452, "train/total_loss": 0.1301465630531311 }, { "entropy": 9.16207504272461, "epoch": 0.44028079889262406, "mean_token_accuracy": 0.7597483992576599, "num_tokens": 24476024.0, "step": 4453, "train/ce_loss": 1.010150671005249 }, { "epoch": 0.44028079889262406, "step": 4453, "train/sim_loss": 0.02734375 }, { "epoch": 0.44028079889262406, "step": 4453, "train/total_loss": 0.12835881114006042 }, { "entropy": 8.739547729492188, "epoch": 0.4403796717421396, "mean_token_accuracy": 0.6908491253852844, "num_tokens": 24481742.0, "step": 4454, "train/ce_loss": 0.7576449513435364 }, { "epoch": 0.4403796717421396, "step": 4454, "train/sim_loss": 0.05078125 }, { "epoch": 0.4403796717421396, "step": 4454, "train/total_loss": 0.1265457570552826 }, { "entropy": 8.871628761291504, "epoch": 0.44047854459165514, "mean_token_accuracy": 0.7443850040435791, "num_tokens": 24487283.0, "step": 4455, "train/ce_loss": 0.7303671836853027 }, { "epoch": 0.44047854459165514, "step": 4455, "train/sim_loss": 0.03515625 }, { "epoch": 0.44047854459165514, "step": 4455, "train/total_loss": 0.10819297283887863 }, { "entropy": 9.034194946289062, "epoch": 0.4405774174411707, "mean_token_accuracy": 0.7156286835670471, "num_tokens": 24492794.0, "step": 4456, "train/ce_loss": 1.0618560314178467 }, { "epoch": 0.4405774174411707, "step": 4456, "train/sim_loss": 0.0546875 }, { "epoch": 0.4405774174411707, "step": 4456, "train/total_loss": 0.16087311506271362 }, { "entropy": 9.440287590026855, "epoch": 0.44067629029068617, "mean_token_accuracy": 0.7364016771316528, "num_tokens": 24498054.0, "step": 4457, "train/ce_loss": 0.9805535078048706 }, { "epoch": 0.44067629029068617, "step": 4457, "train/sim_loss": 0.0859375 }, { "epoch": 0.44067629029068617, "step": 4457, "train/total_loss": 0.18399286270141602 }, { "entropy": 8.828397750854492, "epoch": 0.4407751631402017, "mean_token_accuracy": 0.8032085299491882, "num_tokens": 24503634.0, "step": 4458, "train/ce_loss": 0.2798977494239807 }, { "epoch": 0.4407751631402017, "step": 4458, "train/sim_loss": 0.01171875 }, { "epoch": 0.4407751631402017, "step": 4458, "train/total_loss": 0.03970852494239807 }, { "entropy": 8.988703727722168, "epoch": 0.44087403598971725, "mean_token_accuracy": 0.7340425252914429, "num_tokens": 24509055.0, "step": 4459, "train/ce_loss": 1.2354345321655273 }, { "epoch": 0.44087403598971725, "step": 4459, "train/sim_loss": 0.1015625 }, { "epoch": 0.44087403598971725, "step": 4459, "train/total_loss": 0.22510595619678497 }, { "epoch": 0.44097290883923274, "grad_norm": 0.692470133304596, "learning_rate": 8.900014834594275e-06, "loss": 0.1431, "step": 4460 }, { "entropy": 9.182571411132812, "epoch": 0.44097290883923274, "mean_token_accuracy": 0.7702857255935669, "num_tokens": 24514497.0, "step": 4460, "train/ce_loss": 1.2816853523254395 }, { "epoch": 0.44097290883923274, "step": 4460, "train/sim_loss": 0.0625 }, { "epoch": 0.44097290883923274, "step": 4460, "train/total_loss": 0.19066853821277618 }, { "entropy": 9.070667266845703, "epoch": 0.4410717816887483, "mean_token_accuracy": 0.7628361582756042, "num_tokens": 24519923.0, "step": 4461, "train/ce_loss": 0.5079524517059326 }, { "epoch": 0.4410717816887483, "step": 4461, "train/sim_loss": 0.05078125 }, { "epoch": 0.4410717816887483, "step": 4461, "train/total_loss": 0.10157649219036102 }, { "entropy": 9.018738746643066, "epoch": 0.4411706545382638, "mean_token_accuracy": 0.7592592835426331, "num_tokens": 24525474.0, "step": 4462, "train/ce_loss": 0.639980137348175 }, { "epoch": 0.4411706545382638, "step": 4462, "train/sim_loss": 0.05859375 }, { "epoch": 0.4411706545382638, "step": 4462, "train/total_loss": 0.1225917637348175 }, { "entropy": 8.78824234008789, "epoch": 0.4412695273877793, "mean_token_accuracy": 0.7195828557014465, "num_tokens": 24531009.0, "step": 4463, "train/ce_loss": 1.3802077770233154 }, { "epoch": 0.4412695273877793, "step": 4463, "train/sim_loss": 0.05859375 }, { "epoch": 0.4412695273877793, "step": 4463, "train/total_loss": 0.19661453366279602 }, { "entropy": 9.220052719116211, "epoch": 0.44136840023729484, "mean_token_accuracy": 0.7226386666297913, "num_tokens": 24536350.0, "step": 4464, "train/ce_loss": 0.4691851735115051 }, { "epoch": 0.44136840023729484, "step": 4464, "train/sim_loss": 0.05859375 }, { "epoch": 0.44136840023729484, "step": 4464, "train/total_loss": 0.10551226884126663 }, { "entropy": 8.838144302368164, "epoch": 0.4414672730868104, "mean_token_accuracy": 0.7536092400550842, "num_tokens": 24541942.0, "step": 4465, "train/ce_loss": 0.5892795920372009 }, { "epoch": 0.4414672730868104, "step": 4465, "train/sim_loss": 0.09375 }, { "epoch": 0.4414672730868104, "step": 4465, "train/total_loss": 0.15267795324325562 }, { "entropy": 9.012930870056152, "epoch": 0.44156614593632587, "mean_token_accuracy": 0.8023391962051392, "num_tokens": 24547290.0, "step": 4466, "train/ce_loss": 0.8920934200286865 }, { "epoch": 0.44156614593632587, "step": 4466, "train/sim_loss": 0.046875 }, { "epoch": 0.44156614593632587, "step": 4466, "train/total_loss": 0.13608434796333313 }, { "entropy": 8.981893539428711, "epoch": 0.4416650187858414, "mean_token_accuracy": 0.763832688331604, "num_tokens": 24552618.0, "step": 4467, "train/ce_loss": 0.45448359847068787 }, { "epoch": 0.4416650187858414, "step": 4467, "train/sim_loss": 0.03125 }, { "epoch": 0.4416650187858414, "step": 4467, "train/total_loss": 0.07669836282730103 }, { "entropy": 8.927632331848145, "epoch": 0.44176389163535695, "mean_token_accuracy": 0.7209567427635193, "num_tokens": 24558132.0, "step": 4468, "train/ce_loss": 1.2017738819122314 }, { "epoch": 0.44176389163535695, "step": 4468, "train/sim_loss": 0.0234375 }, { "epoch": 0.44176389163535695, "step": 4468, "train/total_loss": 0.14361488819122314 }, { "entropy": 9.264801025390625, "epoch": 0.44186276448487244, "mean_token_accuracy": 0.7857142686843872, "num_tokens": 24563462.0, "step": 4469, "train/ce_loss": 0.9996740221977234 }, { "epoch": 0.44186276448487244, "step": 4469, "train/sim_loss": 0.046875 }, { "epoch": 0.44186276448487244, "step": 4469, "train/total_loss": 0.14684240520000458 }, { "entropy": 8.536149978637695, "epoch": 0.441961637334388, "mean_token_accuracy": 0.7493837475776672, "num_tokens": 24569201.0, "step": 4470, "train/ce_loss": 1.8537917137145996 }, { "epoch": 0.441961637334388, "step": 4470, "train/sim_loss": 0.06640625 }, { "epoch": 0.441961637334388, "step": 4470, "train/total_loss": 0.25178542733192444 }, { "entropy": 8.986776351928711, "epoch": 0.4420605101839035, "mean_token_accuracy": 0.8397027850151062, "num_tokens": 24574822.0, "step": 4471, "train/ce_loss": 0.37889334559440613 }, { "epoch": 0.4420605101839035, "step": 4471, "train/sim_loss": 0.0234375 }, { "epoch": 0.4420605101839035, "step": 4471, "train/total_loss": 0.06132683530449867 }, { "entropy": 8.939252853393555, "epoch": 0.442159383033419, "mean_token_accuracy": 0.7370466589927673, "num_tokens": 24580258.0, "step": 4472, "train/ce_loss": 0.5808043479919434 }, { "epoch": 0.442159383033419, "step": 4472, "train/sim_loss": 0.03515625 }, { "epoch": 0.442159383033419, "step": 4472, "train/total_loss": 0.09323668479919434 }, { "entropy": 8.87470817565918, "epoch": 0.44225825588293455, "mean_token_accuracy": 0.7608453631401062, "num_tokens": 24585805.0, "step": 4473, "train/ce_loss": 0.5109397768974304 }, { "epoch": 0.44225825588293455, "step": 4473, "train/sim_loss": 0.02734375 }, { "epoch": 0.44225825588293455, "step": 4473, "train/total_loss": 0.07843773066997528 }, { "entropy": 8.976507186889648, "epoch": 0.4423571287324501, "mean_token_accuracy": 0.7462857365608215, "num_tokens": 24591243.0, "step": 4474, "train/ce_loss": 1.1241252422332764 }, { "epoch": 0.4423571287324501, "step": 4474, "train/sim_loss": 0.06640625 }, { "epoch": 0.4423571287324501, "step": 4474, "train/total_loss": 0.17881877720355988 }, { "entropy": 9.068705558776855, "epoch": 0.4424560015819656, "mean_token_accuracy": 0.7712609767913818, "num_tokens": 24596524.0, "step": 4475, "train/ce_loss": 0.655724823474884 }, { "epoch": 0.4424560015819656, "step": 4475, "train/sim_loss": 0.08984375 }, { "epoch": 0.4424560015819656, "step": 4475, "train/total_loss": 0.15541623532772064 }, { "entropy": 9.069625854492188, "epoch": 0.4425548744314811, "mean_token_accuracy": 0.7294264435768127, "num_tokens": 24601954.0, "step": 4476, "train/ce_loss": 0.4554649293422699 }, { "epoch": 0.4425548744314811, "step": 4476, "train/sim_loss": 0.04296875 }, { "epoch": 0.4425548744314811, "step": 4476, "train/total_loss": 0.08851524442434311 }, { "entropy": 8.985418319702148, "epoch": 0.44265374728099666, "mean_token_accuracy": 0.7540983557701111, "num_tokens": 24607418.0, "step": 4477, "train/ce_loss": 0.5593478083610535 }, { "epoch": 0.44265374728099666, "step": 4477, "train/sim_loss": 0.1015625 }, { "epoch": 0.44265374728099666, "step": 4477, "train/total_loss": 0.15749728679656982 }, { "entropy": 8.964500427246094, "epoch": 0.44275262013051214, "mean_token_accuracy": 0.7024221420288086, "num_tokens": 24612905.0, "step": 4478, "train/ce_loss": 0.6686775088310242 }, { "epoch": 0.44275262013051214, "step": 4478, "train/sim_loss": 0.046875 }, { "epoch": 0.44275262013051214, "step": 4478, "train/total_loss": 0.11374275386333466 }, { "entropy": 9.261520385742188, "epoch": 0.4428514929800277, "mean_token_accuracy": 0.7691197395324707, "num_tokens": 24618194.0, "step": 4479, "train/ce_loss": 0.7675915360450745 }, { "epoch": 0.4428514929800277, "step": 4479, "train/sim_loss": 0.0546875 }, { "epoch": 0.4428514929800277, "step": 4479, "train/total_loss": 0.13144665956497192 }, { "epoch": 0.4429503658295432, "grad_norm": 0.7160285115242004, "learning_rate": 8.895069969836325e-06, "loss": 0.1381, "step": 4480 }, { "entropy": 9.244056701660156, "epoch": 0.4429503658295432, "mean_token_accuracy": 0.7503759264945984, "num_tokens": 24623513.0, "step": 4480, "train/ce_loss": 0.7926902174949646 }, { "epoch": 0.4429503658295432, "step": 4480, "train/sim_loss": 0.01953125 }, { "epoch": 0.4429503658295432, "step": 4480, "train/total_loss": 0.09880027174949646 }, { "entropy": 9.196931838989258, "epoch": 0.4430492386790587, "mean_token_accuracy": 0.75, "num_tokens": 24628969.0, "step": 4481, "train/ce_loss": 0.7969157099723816 }, { "epoch": 0.4430492386790587, "step": 4481, "train/sim_loss": 0.046875 }, { "epoch": 0.4430492386790587, "step": 4481, "train/total_loss": 0.1265665739774704 }, { "entropy": 8.845486640930176, "epoch": 0.44314811152857425, "mean_token_accuracy": 0.732292890548706, "num_tokens": 24634463.0, "step": 4482, "train/ce_loss": 1.1185194253921509 }, { "epoch": 0.44314811152857425, "step": 4482, "train/sim_loss": 0.03125 }, { "epoch": 0.44314811152857425, "step": 4482, "train/total_loss": 0.14310194551944733 }, { "entropy": 9.02662181854248, "epoch": 0.4432469843780898, "mean_token_accuracy": 0.7942857146263123, "num_tokens": 24639952.0, "step": 4483, "train/ce_loss": 0.9833078980445862 }, { "epoch": 0.4432469843780898, "step": 4483, "train/sim_loss": 0.109375 }, { "epoch": 0.4432469843780898, "step": 4483, "train/total_loss": 0.2077057957649231 }, { "entropy": 8.391189575195312, "epoch": 0.4433458572276053, "mean_token_accuracy": 0.7284615635871887, "num_tokens": 24645902.0, "step": 4484, "train/ce_loss": 0.6281371116638184 }, { "epoch": 0.4433458572276053, "step": 4484, "train/sim_loss": 0.05078125 }, { "epoch": 0.4433458572276053, "step": 4484, "train/total_loss": 0.11359496414661407 }, { "entropy": 8.943374633789062, "epoch": 0.4434447300771208, "mean_token_accuracy": 0.7262773513793945, "num_tokens": 24651381.0, "step": 4485, "train/ce_loss": 0.3582727015018463 }, { "epoch": 0.4434447300771208, "step": 4485, "train/sim_loss": 0.0234375 }, { "epoch": 0.4434447300771208, "step": 4485, "train/total_loss": 0.05926477164030075 }, { "entropy": 9.115114212036133, "epoch": 0.44354360292663636, "mean_token_accuracy": 0.7556390762329102, "num_tokens": 24656858.0, "step": 4486, "train/ce_loss": 0.4264664947986603 }, { "epoch": 0.44354360292663636, "step": 4486, "train/sim_loss": 0.05859375 }, { "epoch": 0.44354360292663636, "step": 4486, "train/total_loss": 0.10124039649963379 }, { "entropy": 8.699213981628418, "epoch": 0.44364247577615185, "mean_token_accuracy": 0.7429406046867371, "num_tokens": 24662568.0, "step": 4487, "train/ce_loss": 0.7069482803344727 }, { "epoch": 0.44364247577615185, "step": 4487, "train/sim_loss": 0.1015625 }, { "epoch": 0.44364247577615185, "step": 4487, "train/total_loss": 0.17225733399391174 }, { "entropy": 8.816816329956055, "epoch": 0.4437413486256674, "mean_token_accuracy": 0.7622180581092834, "num_tokens": 24668289.0, "step": 4488, "train/ce_loss": 0.5966444611549377 }, { "epoch": 0.4437413486256674, "step": 4488, "train/sim_loss": 0.05859375 }, { "epoch": 0.4437413486256674, "step": 4488, "train/total_loss": 0.11825819313526154 }, { "entropy": 9.147115707397461, "epoch": 0.44384022147518293, "mean_token_accuracy": 0.7515006065368652, "num_tokens": 24673720.0, "step": 4489, "train/ce_loss": 0.7837833166122437 }, { "epoch": 0.44384022147518293, "step": 4489, "train/sim_loss": 0.09375 }, { "epoch": 0.44384022147518293, "step": 4489, "train/total_loss": 0.1721283346414566 }, { "entropy": 9.094482421875, "epoch": 0.4439390943246984, "mean_token_accuracy": 0.7293413281440735, "num_tokens": 24679089.0, "step": 4490, "train/ce_loss": 0.9524022340774536 }, { "epoch": 0.4439390943246984, "step": 4490, "train/sim_loss": 0.08203125 }, { "epoch": 0.4439390943246984, "step": 4490, "train/total_loss": 0.17727148532867432 }, { "entropy": 9.025071144104004, "epoch": 0.44403796717421395, "mean_token_accuracy": 0.7471979856491089, "num_tokens": 24684535.0, "step": 4491, "train/ce_loss": 1.2400314807891846 }, { "epoch": 0.44403796717421395, "step": 4491, "train/sim_loss": 0.0859375 }, { "epoch": 0.44403796717421395, "step": 4491, "train/total_loss": 0.20994064211845398 }, { "entropy": 9.155872344970703, "epoch": 0.4441368400237295, "mean_token_accuracy": 0.7533692717552185, "num_tokens": 24689925.0, "step": 4492, "train/ce_loss": 0.6153109669685364 }, { "epoch": 0.4441368400237295, "step": 4492, "train/sim_loss": 0.05078125 }, { "epoch": 0.4441368400237295, "step": 4492, "train/total_loss": 0.11231234669685364 }, { "entropy": 9.129321098327637, "epoch": 0.444235712873245, "mean_token_accuracy": 0.811475396156311, "num_tokens": 24695330.0, "step": 4493, "train/ce_loss": 0.3830583989620209 }, { "epoch": 0.444235712873245, "step": 4493, "train/sim_loss": 0.0234375 }, { "epoch": 0.444235712873245, "step": 4493, "train/total_loss": 0.06174334138631821 }, { "entropy": 8.901857376098633, "epoch": 0.4443345857227605, "mean_token_accuracy": 0.7482014298439026, "num_tokens": 24700688.0, "step": 4494, "train/ce_loss": 0.9187345504760742 }, { "epoch": 0.4443345857227605, "step": 4494, "train/sim_loss": 0.0703125 }, { "epoch": 0.4443345857227605, "step": 4494, "train/total_loss": 0.16218596696853638 }, { "entropy": 8.88507080078125, "epoch": 0.44443345857227606, "mean_token_accuracy": 0.7202127575874329, "num_tokens": 24706175.0, "step": 4495, "train/ce_loss": 0.6692608594894409 }, { "epoch": 0.44443345857227606, "step": 4495, "train/sim_loss": 0.03125 }, { "epoch": 0.44443345857227606, "step": 4495, "train/total_loss": 0.09817608445882797 }, { "entropy": 9.10336685180664, "epoch": 0.44453233142179155, "mean_token_accuracy": 0.7287299633026123, "num_tokens": 24711559.0, "step": 4496, "train/ce_loss": 1.1186846494674683 }, { "epoch": 0.44453233142179155, "step": 4496, "train/sim_loss": 0.1484375 }, { "epoch": 0.44453233142179155, "step": 4496, "train/total_loss": 0.2603059709072113 }, { "entropy": 9.213830947875977, "epoch": 0.4446312042713071, "mean_token_accuracy": 0.7249357104301453, "num_tokens": 24716970.0, "step": 4497, "train/ce_loss": 1.5009890794754028 }, { "epoch": 0.4446312042713071, "step": 4497, "train/sim_loss": 0.0625 }, { "epoch": 0.4446312042713071, "step": 4497, "train/total_loss": 0.21259890496730804 }, { "entropy": 9.245721817016602, "epoch": 0.44473007712082263, "mean_token_accuracy": 0.7468879818916321, "num_tokens": 24722382.0, "step": 4498, "train/ce_loss": 0.757320761680603 }, { "epoch": 0.44473007712082263, "step": 4498, "train/sim_loss": 0.07421875 }, { "epoch": 0.44473007712082263, "step": 4498, "train/total_loss": 0.14995083212852478 }, { "entropy": 9.208671569824219, "epoch": 0.44482894997033817, "mean_token_accuracy": 0.751978874206543, "num_tokens": 24727717.0, "step": 4499, "train/ce_loss": 0.6780239343643188 }, { "epoch": 0.44482894997033817, "step": 4499, "train/sim_loss": 0.09375 }, { "epoch": 0.44482894997033817, "step": 4499, "train/total_loss": 0.16155239939689636 }, { "epoch": 0.44492782281985366, "grad_norm": 0.6984949111938477, "learning_rate": 8.890125105078377e-06, "loss": 0.1432, "step": 4500 }, { "entropy": 8.755548477172852, "epoch": 0.44492782281985366, "mean_token_accuracy": 0.7773398756980896, "num_tokens": 24733407.0, "step": 4500, "train/ce_loss": 0.6540170311927795 }, { "epoch": 0.44492782281985366, "step": 4500, "train/sim_loss": 0.09765625 }, { "epoch": 0.44492782281985366, "step": 4500, "train/total_loss": 0.16305795311927795 }, { "entropy": 9.302948951721191, "epoch": 0.4450266956693692, "mean_token_accuracy": 0.7354925870895386, "num_tokens": 24738767.0, "step": 4501, "train/ce_loss": 0.9180766344070435 }, { "epoch": 0.4450266956693692, "step": 4501, "train/sim_loss": 0.1015625 }, { "epoch": 0.4450266956693692, "step": 4501, "train/total_loss": 0.19337016344070435 }, { "entropy": 8.543020248413086, "epoch": 0.44512556851888474, "mean_token_accuracy": 0.7276595830917358, "num_tokens": 24744294.0, "step": 4502, "train/ce_loss": 0.5818716883659363 }, { "epoch": 0.44512556851888474, "step": 4502, "train/sim_loss": 0.0546875 }, { "epoch": 0.44512556851888474, "step": 4502, "train/total_loss": 0.11287467181682587 }, { "entropy": 8.981334686279297, "epoch": 0.4452244413684002, "mean_token_accuracy": 0.740276038646698, "num_tokens": 24749727.0, "step": 4503, "train/ce_loss": 0.4762517213821411 }, { "epoch": 0.4452244413684002, "step": 4503, "train/sim_loss": 0.06640625 }, { "epoch": 0.4452244413684002, "step": 4503, "train/total_loss": 0.11403141915798187 }, { "entropy": 9.141170501708984, "epoch": 0.44532331421791577, "mean_token_accuracy": 0.7598522305488586, "num_tokens": 24755067.0, "step": 4504, "train/ce_loss": 0.5775616765022278 }, { "epoch": 0.44532331421791577, "step": 4504, "train/sim_loss": 0.03515625 }, { "epoch": 0.44532331421791577, "step": 4504, "train/total_loss": 0.09291242063045502 }, { "entropy": 9.050025939941406, "epoch": 0.4454221870674313, "mean_token_accuracy": 0.749099612236023, "num_tokens": 24760575.0, "step": 4505, "train/ce_loss": 1.2573282718658447 }, { "epoch": 0.4454221870674313, "step": 4505, "train/sim_loss": 0.06640625 }, { "epoch": 0.4454221870674313, "step": 4505, "train/total_loss": 0.19213907420635223 }, { "entropy": 9.34515380859375, "epoch": 0.4455210599169468, "mean_token_accuracy": 0.7812911868095398, "num_tokens": 24765910.0, "step": 4506, "train/ce_loss": 0.8575665950775146 }, { "epoch": 0.4455210599169468, "step": 4506, "train/sim_loss": 0.0703125 }, { "epoch": 0.4455210599169468, "step": 4506, "train/total_loss": 0.15606915950775146 }, { "entropy": 9.020055770874023, "epoch": 0.44561993276646233, "mean_token_accuracy": 0.7645089030265808, "num_tokens": 24771425.0, "step": 4507, "train/ce_loss": 0.7966243624687195 }, { "epoch": 0.44561993276646233, "step": 4507, "train/sim_loss": 0.0703125 }, { "epoch": 0.44561993276646233, "step": 4507, "train/total_loss": 0.14997494220733643 }, { "entropy": 8.691665649414062, "epoch": 0.4457188056159779, "mean_token_accuracy": 0.7139534950256348, "num_tokens": 24776948.0, "step": 4508, "train/ce_loss": 0.7745142579078674 }, { "epoch": 0.4457188056159779, "step": 4508, "train/sim_loss": 0.1484375 }, { "epoch": 0.4457188056159779, "step": 4508, "train/total_loss": 0.2258889377117157 }, { "entropy": 9.262264251708984, "epoch": 0.44581767846549336, "mean_token_accuracy": 0.7456021904945374, "num_tokens": 24782299.0, "step": 4509, "train/ce_loss": 1.3613173961639404 }, { "epoch": 0.44581767846549336, "step": 4509, "train/sim_loss": 0.12109375 }, { "epoch": 0.44581767846549336, "step": 4509, "train/total_loss": 0.25722551345825195 }, { "entropy": 8.793807983398438, "epoch": 0.4459165513150089, "mean_token_accuracy": 0.7786790132522583, "num_tokens": 24787967.0, "step": 4510, "train/ce_loss": 0.8446704149246216 }, { "epoch": 0.4459165513150089, "step": 4510, "train/sim_loss": 0.1171875 }, { "epoch": 0.4459165513150089, "step": 4510, "train/total_loss": 0.2016545534133911 }, { "entropy": 8.947088241577148, "epoch": 0.44601542416452444, "mean_token_accuracy": 0.8115800023078918, "num_tokens": 24793608.0, "step": 4511, "train/ce_loss": 0.7708721160888672 }, { "epoch": 0.44601542416452444, "step": 4511, "train/sim_loss": 0.0703125 }, { "epoch": 0.44601542416452444, "step": 4511, "train/total_loss": 0.14739972352981567 }, { "entropy": 8.692684173583984, "epoch": 0.44611429701403993, "mean_token_accuracy": 0.7992007732391357, "num_tokens": 24799246.0, "step": 4512, "train/ce_loss": 0.7596167325973511 }, { "epoch": 0.44611429701403993, "step": 4512, "train/sim_loss": 0.1171875 }, { "epoch": 0.44611429701403993, "step": 4512, "train/total_loss": 0.19314917922019958 }, { "entropy": 9.099835395812988, "epoch": 0.44621316986355547, "mean_token_accuracy": 0.7812097668647766, "num_tokens": 24804638.0, "step": 4513, "train/ce_loss": 0.34384214878082275 }, { "epoch": 0.44621316986355547, "step": 4513, "train/sim_loss": 0.08984375 }, { "epoch": 0.44621316986355547, "step": 4513, "train/total_loss": 0.12422797083854675 }, { "entropy": 8.814414024353027, "epoch": 0.446312042713071, "mean_token_accuracy": 0.7107843160629272, "num_tokens": 24810094.0, "step": 4514, "train/ce_loss": 1.0511869192123413 }, { "epoch": 0.446312042713071, "step": 4514, "train/sim_loss": 0.05078125 }, { "epoch": 0.446312042713071, "step": 4514, "train/total_loss": 0.15589994192123413 }, { "entropy": 9.11019515991211, "epoch": 0.4464109155625865, "mean_token_accuracy": 0.7704081535339355, "num_tokens": 24815444.0, "step": 4515, "train/ce_loss": 0.3143487274646759 }, { "epoch": 0.4464109155625865, "step": 4515, "train/sim_loss": 0.02734375 }, { "epoch": 0.4464109155625865, "step": 4515, "train/total_loss": 0.05877862498164177 }, { "entropy": 8.844644546508789, "epoch": 0.44650978841210204, "mean_token_accuracy": 0.7257732152938843, "num_tokens": 24821033.0, "step": 4516, "train/ce_loss": 0.602832555770874 }, { "epoch": 0.44650978841210204, "step": 4516, "train/sim_loss": 0.05859375 }, { "epoch": 0.44650978841210204, "step": 4516, "train/total_loss": 0.11887700855731964 }, { "entropy": 8.68660831451416, "epoch": 0.4466086612616176, "mean_token_accuracy": 0.6948229074478149, "num_tokens": 24826757.0, "step": 4517, "train/ce_loss": 0.5787649154663086 }, { "epoch": 0.4466086612616176, "step": 4517, "train/sim_loss": 0.109375 }, { "epoch": 0.4466086612616176, "step": 4517, "train/total_loss": 0.16725149750709534 }, { "entropy": 9.130562782287598, "epoch": 0.44670753411113306, "mean_token_accuracy": 0.7001239061355591, "num_tokens": 24832268.0, "step": 4518, "train/ce_loss": 1.5998291969299316 }, { "epoch": 0.44670753411113306, "step": 4518, "train/sim_loss": 0.09765625 }, { "epoch": 0.44670753411113306, "step": 4518, "train/total_loss": 0.25763916969299316 }, { "entropy": 8.886415481567383, "epoch": 0.4468064069606486, "mean_token_accuracy": 0.7609603404998779, "num_tokens": 24837834.0, "step": 4519, "train/ce_loss": 1.0691168308258057 }, { "epoch": 0.4468064069606486, "step": 4519, "train/sim_loss": 0.07421875 }, { "epoch": 0.4468064069606486, "step": 4519, "train/total_loss": 0.18113043904304504 }, { "epoch": 0.44690527981016415, "grad_norm": 0.7360351085662842, "learning_rate": 8.885180240320428e-06, "loss": 0.1447, "step": 4520 }, { "entropy": 9.066244125366211, "epoch": 0.44690527981016415, "mean_token_accuracy": 0.698300302028656, "num_tokens": 24843060.0, "step": 4520, "train/ce_loss": 0.37150219082832336 }, { "epoch": 0.44690527981016415, "step": 4520, "train/sim_loss": 0.04296875 }, { "epoch": 0.44690527981016415, "step": 4520, "train/total_loss": 0.08011896908283234 }, { "entropy": 9.271928787231445, "epoch": 0.44700415265967963, "mean_token_accuracy": 0.7577720284461975, "num_tokens": 24848481.0, "step": 4521, "train/ce_loss": 1.0529297590255737 }, { "epoch": 0.44700415265967963, "step": 4521, "train/sim_loss": 0.0859375 }, { "epoch": 0.44700415265967963, "step": 4521, "train/total_loss": 0.19123047590255737 }, { "entropy": 9.043785095214844, "epoch": 0.4471030255091952, "mean_token_accuracy": 0.7431818246841431, "num_tokens": 24853953.0, "step": 4522, "train/ce_loss": 1.1406216621398926 }, { "epoch": 0.4471030255091952, "step": 4522, "train/sim_loss": 0.0859375 }, { "epoch": 0.4471030255091952, "step": 4522, "train/total_loss": 0.19999966025352478 }, { "entropy": 9.032511711120605, "epoch": 0.4472018983587107, "mean_token_accuracy": 0.7908992171287537, "num_tokens": 24859510.0, "step": 4523, "train/ce_loss": 0.8725542426109314 }, { "epoch": 0.4472018983587107, "step": 4523, "train/sim_loss": 0.0234375 }, { "epoch": 0.4472018983587107, "step": 4523, "train/total_loss": 0.11069292575120926 }, { "entropy": 9.011327743530273, "epoch": 0.4473007712082262, "mean_token_accuracy": 0.7128463387489319, "num_tokens": 24864895.0, "step": 4524, "train/ce_loss": 0.524046003818512 }, { "epoch": 0.4473007712082262, "step": 4524, "train/sim_loss": 0.0234375 }, { "epoch": 0.4473007712082262, "step": 4524, "train/total_loss": 0.07584209740161896 }, { "entropy": 8.948314666748047, "epoch": 0.44739964405774174, "mean_token_accuracy": 0.7513935565948486, "num_tokens": 24870385.0, "step": 4525, "train/ce_loss": 0.40651240944862366 }, { "epoch": 0.44739964405774174, "step": 4525, "train/sim_loss": 0.046875 }, { "epoch": 0.44739964405774174, "step": 4525, "train/total_loss": 0.08752624690532684 }, { "entropy": 9.028936386108398, "epoch": 0.4474985169072573, "mean_token_accuracy": 0.7730414867401123, "num_tokens": 24875861.0, "step": 4526, "train/ce_loss": 0.5208938121795654 }, { "epoch": 0.4474985169072573, "step": 4526, "train/sim_loss": 0.0234375 }, { "epoch": 0.4474985169072573, "step": 4526, "train/total_loss": 0.0755268782377243 }, { "entropy": 9.052875518798828, "epoch": 0.44759738975677277, "mean_token_accuracy": 0.7605294585227966, "num_tokens": 24881278.0, "step": 4527, "train/ce_loss": 0.6871906518936157 }, { "epoch": 0.44759738975677277, "step": 4527, "train/sim_loss": 0.0703125 }, { "epoch": 0.44759738975677277, "step": 4527, "train/total_loss": 0.1390315592288971 }, { "entropy": 8.599630355834961, "epoch": 0.4476962626062883, "mean_token_accuracy": 0.7614601254463196, "num_tokens": 24887046.0, "step": 4528, "train/ce_loss": 0.6945182085037231 }, { "epoch": 0.4476962626062883, "step": 4528, "train/sim_loss": 0.109375 }, { "epoch": 0.4476962626062883, "step": 4528, "train/total_loss": 0.17882682383060455 }, { "entropy": 9.025761604309082, "epoch": 0.44779513545580385, "mean_token_accuracy": 0.7310606241226196, "num_tokens": 24892424.0, "step": 4529, "train/ce_loss": 1.0578218698501587 }, { "epoch": 0.44779513545580385, "step": 4529, "train/sim_loss": 0.05859375 }, { "epoch": 0.44779513545580385, "step": 4529, "train/total_loss": 0.1643759310245514 }, { "entropy": 8.874309539794922, "epoch": 0.44789400830531934, "mean_token_accuracy": 0.767756462097168, "num_tokens": 24897909.0, "step": 4530, "train/ce_loss": 0.6031731367111206 }, { "epoch": 0.44789400830531934, "step": 4530, "train/sim_loss": 0.03515625 }, { "epoch": 0.44789400830531934, "step": 4530, "train/total_loss": 0.09547356516122818 }, { "entropy": 9.05894947052002, "epoch": 0.4479928811548349, "mean_token_accuracy": 0.7600459456443787, "num_tokens": 24903366.0, "step": 4531, "train/ce_loss": 0.3548393249511719 }, { "epoch": 0.4479928811548349, "step": 4531, "train/sim_loss": 0.0234375 }, { "epoch": 0.4479928811548349, "step": 4531, "train/total_loss": 0.05892143398523331 }, { "entropy": 8.922712326049805, "epoch": 0.4480917540043504, "mean_token_accuracy": 0.7034220695495605, "num_tokens": 24908785.0, "step": 4532, "train/ce_loss": 0.8477046489715576 }, { "epoch": 0.4480917540043504, "step": 4532, "train/sim_loss": 0.04296875 }, { "epoch": 0.4480917540043504, "step": 4532, "train/total_loss": 0.12773922085762024 }, { "entropy": 8.693946838378906, "epoch": 0.4481906268538659, "mean_token_accuracy": 0.7322916388511658, "num_tokens": 24914399.0, "step": 4533, "train/ce_loss": 1.2688677310943604 }, { "epoch": 0.4481906268538659, "step": 4533, "train/sim_loss": 0.0546875 }, { "epoch": 0.4481906268538659, "step": 4533, "train/total_loss": 0.1815742701292038 }, { "entropy": 9.12142562866211, "epoch": 0.44828949970338144, "mean_token_accuracy": 0.7485875487327576, "num_tokens": 24919713.0, "step": 4534, "train/ce_loss": 0.800855278968811 }, { "epoch": 0.44828949970338144, "step": 4534, "train/sim_loss": 0.0390625 }, { "epoch": 0.44828949970338144, "step": 4534, "train/total_loss": 0.11914803087711334 }, { "entropy": 8.951560974121094, "epoch": 0.448388372552897, "mean_token_accuracy": 0.7963190078735352, "num_tokens": 24925134.0, "step": 4535, "train/ce_loss": 0.7241727709770203 }, { "epoch": 0.448388372552897, "step": 4535, "train/sim_loss": 0.05859375 }, { "epoch": 0.448388372552897, "step": 4535, "train/total_loss": 0.13101103901863098 }, { "entropy": 8.651470184326172, "epoch": 0.44848724540241247, "mean_token_accuracy": 0.7243186831474304, "num_tokens": 24930611.0, "step": 4536, "train/ce_loss": 0.9688176512718201 }, { "epoch": 0.44848724540241247, "step": 4536, "train/sim_loss": 0.08203125 }, { "epoch": 0.44848724540241247, "step": 4536, "train/total_loss": 0.17891302704811096 }, { "entropy": 9.283239364624023, "epoch": 0.448586118251928, "mean_token_accuracy": 0.7013996839523315, "num_tokens": 24935871.0, "step": 4537, "train/ce_loss": 0.9390580654144287 }, { "epoch": 0.448586118251928, "step": 4537, "train/sim_loss": 0.0625 }, { "epoch": 0.448586118251928, "step": 4537, "train/total_loss": 0.15640580654144287 }, { "entropy": 8.953493118286133, "epoch": 0.44868499110144355, "mean_token_accuracy": 0.7777777910232544, "num_tokens": 24941328.0, "step": 4538, "train/ce_loss": 1.0196865797042847 }, { "epoch": 0.44868499110144355, "step": 4538, "train/sim_loss": 0.06640625 }, { "epoch": 0.44868499110144355, "step": 4538, "train/total_loss": 0.1683749109506607 }, { "entropy": 9.039117813110352, "epoch": 0.4487838639509591, "mean_token_accuracy": 0.7668571472167969, "num_tokens": 24946802.0, "step": 4539, "train/ce_loss": 0.9054794311523438 }, { "epoch": 0.4487838639509591, "step": 4539, "train/sim_loss": 0.0546875 }, { "epoch": 0.4487838639509591, "step": 4539, "train/total_loss": 0.14523544907569885 }, { "epoch": 0.4488827368004746, "grad_norm": 0.669406533241272, "learning_rate": 8.88023537556248e-06, "loss": 0.1346, "step": 4540 }, { "entropy": 8.749265670776367, "epoch": 0.4488827368004746, "mean_token_accuracy": 0.7139689326286316, "num_tokens": 24952333.0, "step": 4540, "train/ce_loss": 0.5315402150154114 }, { "epoch": 0.4488827368004746, "step": 4540, "train/sim_loss": 0.01953125 }, { "epoch": 0.4488827368004746, "step": 4540, "train/total_loss": 0.07268527150154114 }, { "entropy": 8.467789649963379, "epoch": 0.4489816096499901, "mean_token_accuracy": 0.7746967077255249, "num_tokens": 24958051.0, "step": 4541, "train/ce_loss": 0.5614739060401917 }, { "epoch": 0.4489816096499901, "step": 4541, "train/sim_loss": 0.05859375 }, { "epoch": 0.4489816096499901, "step": 4541, "train/total_loss": 0.11474114656448364 }, { "entropy": 8.902462005615234, "epoch": 0.44908048249950566, "mean_token_accuracy": 0.7242937684059143, "num_tokens": 24963675.0, "step": 4542, "train/ce_loss": 0.5321301221847534 }, { "epoch": 0.44908048249950566, "step": 4542, "train/sim_loss": 0.0703125 }, { "epoch": 0.44908048249950566, "step": 4542, "train/total_loss": 0.12352551519870758 }, { "entropy": 9.061014175415039, "epoch": 0.44917935534902115, "mean_token_accuracy": 0.7719298005104065, "num_tokens": 24969043.0, "step": 4543, "train/ce_loss": 0.46197453141212463 }, { "epoch": 0.44917935534902115, "step": 4543, "train/sim_loss": 0.05078125 }, { "epoch": 0.44917935534902115, "step": 4543, "train/total_loss": 0.09697870910167694 }, { "entropy": 9.040348052978516, "epoch": 0.4492782281985367, "mean_token_accuracy": 0.7462311387062073, "num_tokens": 24974416.0, "step": 4544, "train/ce_loss": 0.756935179233551 }, { "epoch": 0.4492782281985367, "step": 4544, "train/sim_loss": 0.08203125 }, { "epoch": 0.4492782281985367, "step": 4544, "train/total_loss": 0.1577247679233551 }, { "entropy": 9.141303062438965, "epoch": 0.44937710104805223, "mean_token_accuracy": 0.735805332660675, "num_tokens": 24980146.0, "step": 4545, "train/ce_loss": 0.7269299030303955 }, { "epoch": 0.44937710104805223, "step": 4545, "train/sim_loss": 0.078125 }, { "epoch": 0.44937710104805223, "step": 4545, "train/total_loss": 0.15081799030303955 }, { "entropy": 8.757617950439453, "epoch": 0.4494759738975677, "mean_token_accuracy": 0.7588978409767151, "num_tokens": 24985627.0, "step": 4546, "train/ce_loss": 0.7692416906356812 }, { "epoch": 0.4494759738975677, "step": 4546, "train/sim_loss": 0.07421875 }, { "epoch": 0.4494759738975677, "step": 4546, "train/total_loss": 0.1511429250240326 }, { "entropy": 8.916977882385254, "epoch": 0.44957484674708326, "mean_token_accuracy": 0.7337142825126648, "num_tokens": 24991135.0, "step": 4547, "train/ce_loss": 0.6681240200996399 }, { "epoch": 0.44957484674708326, "step": 4547, "train/sim_loss": 0.0546875 }, { "epoch": 0.44957484674708326, "step": 4547, "train/total_loss": 0.12149990350008011 }, { "entropy": 8.496770858764648, "epoch": 0.4496737195965988, "mean_token_accuracy": 0.7532923817634583, "num_tokens": 24996891.0, "step": 4548, "train/ce_loss": 0.5717994570732117 }, { "epoch": 0.4496737195965988, "step": 4548, "train/sim_loss": 0.01953125 }, { "epoch": 0.4496737195965988, "step": 4548, "train/total_loss": 0.07671119272708893 }, { "entropy": 9.08253288269043, "epoch": 0.4497725924461143, "mean_token_accuracy": 0.7863962054252625, "num_tokens": 25002344.0, "step": 4549, "train/ce_loss": 0.4656488001346588 }, { "epoch": 0.4497725924461143, "step": 4549, "train/sim_loss": 0.06640625 }, { "epoch": 0.4497725924461143, "step": 4549, "train/total_loss": 0.11297112703323364 }, { "entropy": 8.687976837158203, "epoch": 0.4498714652956298, "mean_token_accuracy": 0.7497725486755371, "num_tokens": 25007986.0, "step": 4550, "train/ce_loss": 1.7163137197494507 }, { "epoch": 0.4498714652956298, "step": 4550, "train/sim_loss": 0.03515625 }, { "epoch": 0.4498714652956298, "step": 4550, "train/total_loss": 0.20678763091564178 }, { "entropy": 9.258780479431152, "epoch": 0.44997033814514537, "mean_token_accuracy": 0.6920052170753479, "num_tokens": 25013395.0, "step": 4551, "train/ce_loss": 0.7935256361961365 }, { "epoch": 0.44997033814514537, "step": 4551, "train/sim_loss": 0.078125 }, { "epoch": 0.44997033814514537, "step": 4551, "train/total_loss": 0.15747755765914917 }, { "entropy": 8.807769775390625, "epoch": 0.45006921099466085, "mean_token_accuracy": 0.7941810488700867, "num_tokens": 25019033.0, "step": 4552, "train/ce_loss": 0.5657673478126526 }, { "epoch": 0.45006921099466085, "step": 4552, "train/sim_loss": 0.04296875 }, { "epoch": 0.45006921099466085, "step": 4552, "train/total_loss": 0.09954548627138138 }, { "entropy": 9.049863815307617, "epoch": 0.4501680838441764, "mean_token_accuracy": 0.7100792527198792, "num_tokens": 25024560.0, "step": 4553, "train/ce_loss": 0.7076995968818665 }, { "epoch": 0.4501680838441764, "step": 4553, "train/sim_loss": 0.07421875 }, { "epoch": 0.4501680838441764, "step": 4553, "train/total_loss": 0.14498871564865112 }, { "entropy": 8.958114624023438, "epoch": 0.45026695669369193, "mean_token_accuracy": 0.7936893105506897, "num_tokens": 25030041.0, "step": 4554, "train/ce_loss": 0.7299112677574158 }, { "epoch": 0.45026695669369193, "step": 4554, "train/sim_loss": 0.0390625 }, { "epoch": 0.45026695669369193, "step": 4554, "train/total_loss": 0.11205362528562546 }, { "entropy": 8.98991870880127, "epoch": 0.4503658295432074, "mean_token_accuracy": 0.7026666402816772, "num_tokens": 25035383.0, "step": 4555, "train/ce_loss": 0.5859864354133606 }, { "epoch": 0.4503658295432074, "step": 4555, "train/sim_loss": 0.0546875 }, { "epoch": 0.4503658295432074, "step": 4555, "train/total_loss": 0.11328614503145218 }, { "entropy": 9.136025428771973, "epoch": 0.45046470239272296, "mean_token_accuracy": 0.7423529624938965, "num_tokens": 25040835.0, "step": 4556, "train/ce_loss": 1.1432976722717285 }, { "epoch": 0.45046470239272296, "step": 4556, "train/sim_loss": 0.09765625 }, { "epoch": 0.45046470239272296, "step": 4556, "train/total_loss": 0.2119860202074051 }, { "entropy": 8.747345924377441, "epoch": 0.4505635752422385, "mean_token_accuracy": 0.7695605754852295, "num_tokens": 25046453.0, "step": 4557, "train/ce_loss": 0.35601767897605896 }, { "epoch": 0.4505635752422385, "step": 4557, "train/sim_loss": 0.0234375 }, { "epoch": 0.4505635752422385, "step": 4557, "train/total_loss": 0.059039268642663956 }, { "entropy": 8.912171363830566, "epoch": 0.450662448091754, "mean_token_accuracy": 0.7417893409729004, "num_tokens": 25051930.0, "step": 4558, "train/ce_loss": 0.37406882643699646 }, { "epoch": 0.450662448091754, "step": 4558, "train/sim_loss": 0.01171875 }, { "epoch": 0.450662448091754, "step": 4558, "train/total_loss": 0.049125634133815765 }, { "entropy": 8.450071334838867, "epoch": 0.4507613209412695, "mean_token_accuracy": 0.7899628281593323, "num_tokens": 25057691.0, "step": 4559, "train/ce_loss": 0.8198689222335815 }, { "epoch": 0.4507613209412695, "step": 4559, "train/sim_loss": 0.02734375 }, { "epoch": 0.4507613209412695, "step": 4559, "train/total_loss": 0.10933064669370651 }, { "epoch": 0.45086019379078507, "grad_norm": 0.5943648815155029, "learning_rate": 8.87529051080453e-06, "loss": 0.1364, "step": 4560 }, { "entropy": 8.584505081176758, "epoch": 0.45086019379078507, "mean_token_accuracy": 0.7945454716682434, "num_tokens": 25063482.0, "step": 4560, "train/ce_loss": 0.4463265538215637 }, { "epoch": 0.45086019379078507, "step": 4560, "train/sim_loss": 0.01953125 }, { "epoch": 0.45086019379078507, "step": 4560, "train/total_loss": 0.06416390836238861 }, { "entropy": 8.90808391571045, "epoch": 0.45095906664030055, "mean_token_accuracy": 0.7863741517066956, "num_tokens": 25068970.0, "step": 4561, "train/ce_loss": 0.8555310964584351 }, { "epoch": 0.45095906664030055, "step": 4561, "train/sim_loss": 0.02734375 }, { "epoch": 0.45095906664030055, "step": 4561, "train/total_loss": 0.1128968596458435 }, { "entropy": 9.19841194152832, "epoch": 0.4510579394898161, "mean_token_accuracy": 0.7173295617103577, "num_tokens": 25074287.0, "step": 4562, "train/ce_loss": 0.7104247808456421 }, { "epoch": 0.4510579394898161, "step": 4562, "train/sim_loss": 0.0625 }, { "epoch": 0.4510579394898161, "step": 4562, "train/total_loss": 0.1335424780845642 }, { "entropy": 8.99036693572998, "epoch": 0.45115681233933164, "mean_token_accuracy": 0.8055555820465088, "num_tokens": 25079789.0, "step": 4563, "train/ce_loss": 0.5889111757278442 }, { "epoch": 0.45115681233933164, "step": 4563, "train/sim_loss": 0.08203125 }, { "epoch": 0.45115681233933164, "step": 4563, "train/total_loss": 0.14092236757278442 }, { "entropy": 9.151470184326172, "epoch": 0.4512556851888471, "mean_token_accuracy": 0.6932367086410522, "num_tokens": 25085280.0, "step": 4564, "train/ce_loss": 1.6696391105651855 }, { "epoch": 0.4512556851888471, "step": 4564, "train/sim_loss": 0.03125 }, { "epoch": 0.4512556851888471, "step": 4564, "train/total_loss": 0.19821391999721527 }, { "entropy": 8.929697036743164, "epoch": 0.45135455803836266, "mean_token_accuracy": 0.7305629849433899, "num_tokens": 25090633.0, "step": 4565, "train/ce_loss": 0.4621365964412689 }, { "epoch": 0.45135455803836266, "step": 4565, "train/sim_loss": 0.0234375 }, { "epoch": 0.45135455803836266, "step": 4565, "train/total_loss": 0.06965115666389465 }, { "entropy": 8.780500411987305, "epoch": 0.4514534308878782, "mean_token_accuracy": 0.7286166548728943, "num_tokens": 25096237.0, "step": 4566, "train/ce_loss": 0.8438911437988281 }, { "epoch": 0.4514534308878782, "step": 4566, "train/sim_loss": 0.0625 }, { "epoch": 0.4514534308878782, "step": 4566, "train/total_loss": 0.1468891203403473 }, { "entropy": 8.996326446533203, "epoch": 0.4515523037373937, "mean_token_accuracy": 0.7930648922920227, "num_tokens": 25101665.0, "step": 4567, "train/ce_loss": 0.56477290391922 }, { "epoch": 0.4515523037373937, "step": 4567, "train/sim_loss": 0.046875 }, { "epoch": 0.4515523037373937, "step": 4567, "train/total_loss": 0.10335229337215424 }, { "entropy": 8.688821792602539, "epoch": 0.45165117658690923, "mean_token_accuracy": 0.7170370221138, "num_tokens": 25106978.0, "step": 4568, "train/ce_loss": 1.6860970258712769 }, { "epoch": 0.45165117658690923, "step": 4568, "train/sim_loss": 0.0859375 }, { "epoch": 0.45165117658690923, "step": 4568, "train/total_loss": 0.25454720854759216 }, { "entropy": 8.927635192871094, "epoch": 0.45175004943642477, "mean_token_accuracy": 0.7525773048400879, "num_tokens": 25112463.0, "step": 4569, "train/ce_loss": 1.1832995414733887 }, { "epoch": 0.45175004943642477, "step": 4569, "train/sim_loss": 0.0625 }, { "epoch": 0.45175004943642477, "step": 4569, "train/total_loss": 0.1808299571275711 }, { "entropy": 9.296575546264648, "epoch": 0.45184892228594026, "mean_token_accuracy": 0.7576601505279541, "num_tokens": 25117812.0, "step": 4570, "train/ce_loss": 0.7493181824684143 }, { "epoch": 0.45184892228594026, "step": 4570, "train/sim_loss": 0.09375 }, { "epoch": 0.45184892228594026, "step": 4570, "train/total_loss": 0.16868183016777039 }, { "entropy": 8.92693042755127, "epoch": 0.4519477951354558, "mean_token_accuracy": 0.7235682606697083, "num_tokens": 25123342.0, "step": 4571, "train/ce_loss": 0.752141535282135 }, { "epoch": 0.4519477951354558, "step": 4571, "train/sim_loss": 0.046875 }, { "epoch": 0.4519477951354558, "step": 4571, "train/total_loss": 0.12208915501832962 }, { "entropy": 8.781251907348633, "epoch": 0.45204666798497134, "mean_token_accuracy": 0.7059447765350342, "num_tokens": 25128910.0, "step": 4572, "train/ce_loss": 0.8504990935325623 }, { "epoch": 0.45204666798497134, "step": 4572, "train/sim_loss": 0.08203125 }, { "epoch": 0.45204666798497134, "step": 4572, "train/total_loss": 0.16708116233348846 }, { "entropy": 9.129827499389648, "epoch": 0.4521455408344868, "mean_token_accuracy": 0.7874692678451538, "num_tokens": 25134306.0, "step": 4573, "train/ce_loss": 0.7048805952072144 }, { "epoch": 0.4521455408344868, "step": 4573, "train/sim_loss": 0.05078125 }, { "epoch": 0.4521455408344868, "step": 4573, "train/total_loss": 0.12126930803060532 }, { "entropy": 8.925529479980469, "epoch": 0.45224441368400237, "mean_token_accuracy": 0.7806004881858826, "num_tokens": 25139785.0, "step": 4574, "train/ce_loss": 0.5105215907096863 }, { "epoch": 0.45224441368400237, "step": 4574, "train/sim_loss": 0.02734375 }, { "epoch": 0.45224441368400237, "step": 4574, "train/total_loss": 0.07839591056108475 }, { "entropy": 9.09764575958252, "epoch": 0.4523432865335179, "mean_token_accuracy": 0.761904776096344, "num_tokens": 25145043.0, "step": 4575, "train/ce_loss": 1.0284522771835327 }, { "epoch": 0.4523432865335179, "step": 4575, "train/sim_loss": 0.140625 }, { "epoch": 0.4523432865335179, "step": 4575, "train/total_loss": 0.2434702217578888 }, { "entropy": 8.796012878417969, "epoch": 0.4524421593830334, "mean_token_accuracy": 0.7506234645843506, "num_tokens": 25150525.0, "step": 4576, "train/ce_loss": 0.7954899072647095 }, { "epoch": 0.4524421593830334, "step": 4576, "train/sim_loss": 0.046875 }, { "epoch": 0.4524421593830334, "step": 4576, "train/total_loss": 0.12642398476600647 }, { "entropy": 8.573749542236328, "epoch": 0.45254103223254893, "mean_token_accuracy": 0.7438370585441589, "num_tokens": 25156035.0, "step": 4577, "train/ce_loss": 0.6305662393569946 }, { "epoch": 0.45254103223254893, "step": 4577, "train/sim_loss": 0.02734375 }, { "epoch": 0.45254103223254893, "step": 4577, "train/total_loss": 0.09040037542581558 }, { "entropy": 8.874002456665039, "epoch": 0.4526399050820645, "mean_token_accuracy": 0.7356557250022888, "num_tokens": 25161662.0, "step": 4578, "train/ce_loss": 1.078852653503418 }, { "epoch": 0.4526399050820645, "step": 4578, "train/sim_loss": 0.0859375 }, { "epoch": 0.4526399050820645, "step": 4578, "train/total_loss": 0.19382277131080627 }, { "entropy": 9.114799499511719, "epoch": 0.45273877793157996, "mean_token_accuracy": 0.7325408458709717, "num_tokens": 25166899.0, "step": 4579, "train/ce_loss": 0.600253164768219 }, { "epoch": 0.45273877793157996, "step": 4579, "train/sim_loss": 0.0390625 }, { "epoch": 0.45273877793157996, "step": 4579, "train/total_loss": 0.09908781945705414 }, { "epoch": 0.4528376507810955, "grad_norm": 0.7141582369804382, "learning_rate": 8.870345646046581e-06, "loss": 0.1343, "step": 4580 }, { "entropy": 8.529949188232422, "epoch": 0.4528376507810955, "mean_token_accuracy": 0.7925724387168884, "num_tokens": 25172718.0, "step": 4580, "train/ce_loss": 0.5497293472290039 }, { "epoch": 0.4528376507810955, "step": 4580, "train/sim_loss": 0.09375 }, { "epoch": 0.4528376507810955, "step": 4580, "train/total_loss": 0.14872293174266815 }, { "entropy": 9.055719375610352, "epoch": 0.45293652363061104, "mean_token_accuracy": 0.7582562565803528, "num_tokens": 25178074.0, "step": 4581, "train/ce_loss": 1.051128625869751 }, { "epoch": 0.45293652363061104, "step": 4581, "train/sim_loss": 0.03515625 }, { "epoch": 0.45293652363061104, "step": 4581, "train/total_loss": 0.14026911556720734 }, { "entropy": 9.064311981201172, "epoch": 0.4530353964801266, "mean_token_accuracy": 0.7445339560508728, "num_tokens": 25183602.0, "step": 4582, "train/ce_loss": 0.800736129283905 }, { "epoch": 0.4530353964801266, "step": 4582, "train/sim_loss": 0.05859375 }, { "epoch": 0.4530353964801266, "step": 4582, "train/total_loss": 0.13866737484931946 }, { "entropy": 9.166340827941895, "epoch": 0.45313426932964207, "mean_token_accuracy": 0.7419354915618896, "num_tokens": 25188903.0, "step": 4583, "train/ce_loss": 0.7888665795326233 }, { "epoch": 0.45313426932964207, "step": 4583, "train/sim_loss": 0.04296875 }, { "epoch": 0.45313426932964207, "step": 4583, "train/total_loss": 0.12185540795326233 }, { "entropy": 8.938203811645508, "epoch": 0.4532331421791576, "mean_token_accuracy": 0.7841044068336487, "num_tokens": 25194322.0, "step": 4584, "train/ce_loss": 0.5928264856338501 }, { "epoch": 0.4532331421791576, "step": 4584, "train/sim_loss": 0.03515625 }, { "epoch": 0.4532331421791576, "step": 4584, "train/total_loss": 0.09443889558315277 }, { "entropy": 8.792181968688965, "epoch": 0.45333201502867315, "mean_token_accuracy": 0.7171825170516968, "num_tokens": 25199891.0, "step": 4585, "train/ce_loss": 0.5308801531791687 }, { "epoch": 0.45333201502867315, "step": 4585, "train/sim_loss": 0.05859375 }, { "epoch": 0.45333201502867315, "step": 4585, "train/total_loss": 0.11168176680803299 }, { "entropy": 8.79084587097168, "epoch": 0.45343088787818864, "mean_token_accuracy": 0.7444196343421936, "num_tokens": 25205398.0, "step": 4586, "train/ce_loss": 0.7083831429481506 }, { "epoch": 0.45343088787818864, "step": 4586, "train/sim_loss": 0.03515625 }, { "epoch": 0.45343088787818864, "step": 4586, "train/total_loss": 0.1059945672750473 }, { "entropy": 9.132658004760742, "epoch": 0.4535297607277042, "mean_token_accuracy": 0.7605095505714417, "num_tokens": 25210776.0, "step": 4587, "train/ce_loss": 0.9067718982696533 }, { "epoch": 0.4535297607277042, "step": 4587, "train/sim_loss": 0.1328125 }, { "epoch": 0.4535297607277042, "step": 4587, "train/total_loss": 0.2234897017478943 }, { "entropy": 8.575628280639648, "epoch": 0.4536286335772197, "mean_token_accuracy": 0.7567567825317383, "num_tokens": 25216472.0, "step": 4588, "train/ce_loss": 0.7787726521492004 }, { "epoch": 0.4536286335772197, "step": 4588, "train/sim_loss": 0.0390625 }, { "epoch": 0.4536286335772197, "step": 4588, "train/total_loss": 0.11693976819515228 }, { "entropy": 9.182966232299805, "epoch": 0.4537275064267352, "mean_token_accuracy": 0.7869565486907959, "num_tokens": 25221800.0, "step": 4589, "train/ce_loss": 0.434343159198761 }, { "epoch": 0.4537275064267352, "step": 4589, "train/sim_loss": 0.0234375 }, { "epoch": 0.4537275064267352, "step": 4589, "train/total_loss": 0.06687182188034058 }, { "entropy": 9.096391677856445, "epoch": 0.45382637927625075, "mean_token_accuracy": 0.7867231369018555, "num_tokens": 25227118.0, "step": 4590, "train/ce_loss": 0.516947329044342 }, { "epoch": 0.45382637927625075, "step": 4590, "train/sim_loss": 0.09765625 }, { "epoch": 0.45382637927625075, "step": 4590, "train/total_loss": 0.14935098588466644 }, { "entropy": 8.683180809020996, "epoch": 0.4539252521257663, "mean_token_accuracy": 0.746065080165863, "num_tokens": 25232656.0, "step": 4591, "train/ce_loss": 1.1565958261489868 }, { "epoch": 0.4539252521257663, "step": 4591, "train/sim_loss": 0.0703125 }, { "epoch": 0.4539252521257663, "step": 4591, "train/total_loss": 0.18597209453582764 }, { "entropy": 8.694950103759766, "epoch": 0.4540241249752818, "mean_token_accuracy": 0.7318435907363892, "num_tokens": 25238243.0, "step": 4592, "train/ce_loss": 0.8018629550933838 }, { "epoch": 0.4540241249752818, "step": 4592, "train/sim_loss": 0.0546875 }, { "epoch": 0.4540241249752818, "step": 4592, "train/total_loss": 0.13487380743026733 }, { "entropy": 8.998732566833496, "epoch": 0.4541229978247973, "mean_token_accuracy": 0.7727272510528564, "num_tokens": 25243693.0, "step": 4593, "train/ce_loss": 0.5778375267982483 }, { "epoch": 0.4541229978247973, "step": 4593, "train/sim_loss": 0.0234375 }, { "epoch": 0.4541229978247973, "step": 4593, "train/total_loss": 0.08122125267982483 }, { "entropy": 8.434696197509766, "epoch": 0.45422187067431286, "mean_token_accuracy": 0.7398523688316345, "num_tokens": 25249396.0, "step": 4594, "train/ce_loss": 1.0108569860458374 }, { "epoch": 0.45422187067431286, "step": 4594, "train/sim_loss": 0.04296875 }, { "epoch": 0.45422187067431286, "step": 4594, "train/total_loss": 0.14405444264411926 }, { "entropy": 9.209203720092773, "epoch": 0.45432074352382834, "mean_token_accuracy": 0.7643051743507385, "num_tokens": 25254916.0, "step": 4595, "train/ce_loss": 0.83284592628479 }, { "epoch": 0.45432074352382834, "step": 4595, "train/sim_loss": 0.11328125 }, { "epoch": 0.45432074352382834, "step": 4595, "train/total_loss": 0.19656583666801453 }, { "entropy": 8.652363777160645, "epoch": 0.4544196163733439, "mean_token_accuracy": 0.7316561937332153, "num_tokens": 25260495.0, "step": 4596, "train/ce_loss": 1.0214430093765259 }, { "epoch": 0.4544196163733439, "step": 4596, "train/sim_loss": 0.09375 }, { "epoch": 0.4544196163733439, "step": 4596, "train/total_loss": 0.1958943009376526 }, { "entropy": 8.72352409362793, "epoch": 0.4545184892228594, "mean_token_accuracy": 0.7426470518112183, "num_tokens": 25266045.0, "step": 4597, "train/ce_loss": 0.7090005278587341 }, { "epoch": 0.4545184892228594, "step": 4597, "train/sim_loss": 0.0859375 }, { "epoch": 0.4545184892228594, "step": 4597, "train/total_loss": 0.1568375527858734 }, { "entropy": 8.834339141845703, "epoch": 0.4546173620723749, "mean_token_accuracy": 0.707317054271698, "num_tokens": 25271461.0, "step": 4598, "train/ce_loss": 1.0892080068588257 }, { "epoch": 0.4546173620723749, "step": 4598, "train/sim_loss": 0.09375 }, { "epoch": 0.4546173620723749, "step": 4598, "train/total_loss": 0.20267081260681152 }, { "entropy": 8.658650398254395, "epoch": 0.45471623492189045, "mean_token_accuracy": 0.7605459094047546, "num_tokens": 25276948.0, "step": 4599, "train/ce_loss": 0.43696025013923645 }, { "epoch": 0.45471623492189045, "step": 4599, "train/sim_loss": 0.04296875 }, { "epoch": 0.45471623492189045, "step": 4599, "train/total_loss": 0.08666478097438812 }, { "epoch": 0.454815107771406, "grad_norm": 0.7236211895942688, "learning_rate": 8.865400781288632e-06, "loss": 0.1358, "step": 4600 }, { "entropy": 8.51458740234375, "epoch": 0.454815107771406, "mean_token_accuracy": 0.7605504393577576, "num_tokens": 25282567.0, "step": 4600, "train/ce_loss": 0.6094155311584473 }, { "epoch": 0.454815107771406, "step": 4600, "train/sim_loss": 0.05078125 }, { "epoch": 0.454815107771406, "step": 4600, "train/total_loss": 0.11172280460596085 }, { "entropy": 8.81161880493164, "epoch": 0.4549139806209215, "mean_token_accuracy": 0.7313084006309509, "num_tokens": 25288096.0, "step": 4601, "train/ce_loss": 1.0146642923355103 }, { "epoch": 0.4549139806209215, "step": 4601, "train/sim_loss": 0.06640625 }, { "epoch": 0.4549139806209215, "step": 4601, "train/total_loss": 0.16787268221378326 }, { "entropy": 9.009737014770508, "epoch": 0.455012853470437, "mean_token_accuracy": 0.7191780805587769, "num_tokens": 25293451.0, "step": 4602, "train/ce_loss": 1.0129436254501343 }, { "epoch": 0.455012853470437, "step": 4602, "train/sim_loss": 0.08984375 }, { "epoch": 0.455012853470437, "step": 4602, "train/total_loss": 0.1911381185054779 }, { "entropy": 9.14792537689209, "epoch": 0.45511172631995256, "mean_token_accuracy": 0.7233748435974121, "num_tokens": 25298789.0, "step": 4603, "train/ce_loss": 1.7093623876571655 }, { "epoch": 0.45511172631995256, "step": 4603, "train/sim_loss": 0.0859375 }, { "epoch": 0.45511172631995256, "step": 4603, "train/total_loss": 0.2568737268447876 }, { "entropy": 8.956989288330078, "epoch": 0.45521059916946804, "mean_token_accuracy": 0.7672811150550842, "num_tokens": 25304241.0, "step": 4604, "train/ce_loss": 1.0834165811538696 }, { "epoch": 0.45521059916946804, "step": 4604, "train/sim_loss": 0.0546875 }, { "epoch": 0.45521059916946804, "step": 4604, "train/total_loss": 0.16302916407585144 }, { "entropy": 8.702672004699707, "epoch": 0.4553094720189836, "mean_token_accuracy": 0.673184335231781, "num_tokens": 25309997.0, "step": 4605, "train/ce_loss": 1.161177396774292 }, { "epoch": 0.4553094720189836, "step": 4605, "train/sim_loss": 0.1015625 }, { "epoch": 0.4553094720189836, "step": 4605, "train/total_loss": 0.21768024563789368 }, { "entropy": 8.734203338623047, "epoch": 0.4554083448684991, "mean_token_accuracy": 0.7894737124443054, "num_tokens": 25315460.0, "step": 4606, "train/ce_loss": 0.957352340221405 }, { "epoch": 0.4554083448684991, "step": 4606, "train/sim_loss": 0.0625 }, { "epoch": 0.4554083448684991, "step": 4606, "train/total_loss": 0.15823523700237274 }, { "entropy": 8.801831245422363, "epoch": 0.4555072177180146, "mean_token_accuracy": 0.6918714642524719, "num_tokens": 25321054.0, "step": 4607, "train/ce_loss": 1.0463972091674805 }, { "epoch": 0.4555072177180146, "step": 4607, "train/sim_loss": 0.0546875 }, { "epoch": 0.4555072177180146, "step": 4607, "train/total_loss": 0.15932722389698029 }, { "entropy": 8.949773788452148, "epoch": 0.45560609056753015, "mean_token_accuracy": 0.7943262457847595, "num_tokens": 25326489.0, "step": 4608, "train/ce_loss": 0.8063943386077881 }, { "epoch": 0.45560609056753015, "step": 4608, "train/sim_loss": 0.05859375 }, { "epoch": 0.45560609056753015, "step": 4608, "train/total_loss": 0.13923318684101105 }, { "entropy": 8.932669639587402, "epoch": 0.4557049634170457, "mean_token_accuracy": 0.7849944233894348, "num_tokens": 25331995.0, "step": 4609, "train/ce_loss": 0.680107593536377 }, { "epoch": 0.4557049634170457, "step": 4609, "train/sim_loss": 0.1015625 }, { "epoch": 0.4557049634170457, "step": 4609, "train/total_loss": 0.16957326233386993 }, { "entropy": 8.778423309326172, "epoch": 0.4558038362665612, "mean_token_accuracy": 0.7144308686256409, "num_tokens": 25337679.0, "step": 4610, "train/ce_loss": 0.7123983502388 }, { "epoch": 0.4558038362665612, "step": 4610, "train/sim_loss": 0.078125 }, { "epoch": 0.4558038362665612, "step": 4610, "train/total_loss": 0.14936482906341553 }, { "entropy": 8.885505676269531, "epoch": 0.4559027091160767, "mean_token_accuracy": 0.7121848464012146, "num_tokens": 25343221.0, "step": 4611, "train/ce_loss": 0.7197675108909607 }, { "epoch": 0.4559027091160767, "step": 4611, "train/sim_loss": 0.06640625 }, { "epoch": 0.4559027091160767, "step": 4611, "train/total_loss": 0.13838300108909607 }, { "entropy": 9.044529914855957, "epoch": 0.45600158196559226, "mean_token_accuracy": 0.7297618985176086, "num_tokens": 25348706.0, "step": 4612, "train/ce_loss": 0.9334263205528259 }, { "epoch": 0.45600158196559226, "step": 4612, "train/sim_loss": 0.04296875 }, { "epoch": 0.45600158196559226, "step": 4612, "train/total_loss": 0.1363113820552826 }, { "entropy": 8.614754676818848, "epoch": 0.45610045481510775, "mean_token_accuracy": 0.760577917098999, "num_tokens": 25354201.0, "step": 4613, "train/ce_loss": 0.40410467982292175 }, { "epoch": 0.45610045481510775, "step": 4613, "train/sim_loss": 0.05078125 }, { "epoch": 0.45610045481510775, "step": 4613, "train/total_loss": 0.09119172394275665 }, { "entropy": 8.706933975219727, "epoch": 0.4561993276646233, "mean_token_accuracy": 0.7141350507736206, "num_tokens": 25359716.0, "step": 4614, "train/ce_loss": 0.8354523181915283 }, { "epoch": 0.4561993276646233, "step": 4614, "train/sim_loss": 0.03515625 }, { "epoch": 0.4561993276646233, "step": 4614, "train/total_loss": 0.11870148032903671 }, { "entropy": 8.971240997314453, "epoch": 0.45629820051413883, "mean_token_accuracy": 0.7662650346755981, "num_tokens": 25365191.0, "step": 4615, "train/ce_loss": 0.8883122801780701 }, { "epoch": 0.45629820051413883, "step": 4615, "train/sim_loss": 0.05078125 }, { "epoch": 0.45629820051413883, "step": 4615, "train/total_loss": 0.13961248099803925 }, { "entropy": 9.044570922851562, "epoch": 0.4563970733636543, "mean_token_accuracy": 0.7488038539886475, "num_tokens": 25370576.0, "step": 4616, "train/ce_loss": 0.6470237374305725 }, { "epoch": 0.4563970733636543, "step": 4616, "train/sim_loss": 0.0625 }, { "epoch": 0.4563970733636543, "step": 4616, "train/total_loss": 0.1272023767232895 }, { "entropy": 8.685352325439453, "epoch": 0.45649594621316986, "mean_token_accuracy": 0.7958412170410156, "num_tokens": 25376281.0, "step": 4617, "train/ce_loss": 1.512109637260437 }, { "epoch": 0.45649594621316986, "step": 4617, "train/sim_loss": 0.0703125 }, { "epoch": 0.45649594621316986, "step": 4617, "train/total_loss": 0.2215234637260437 }, { "entropy": 8.414947509765625, "epoch": 0.4565948190626854, "mean_token_accuracy": 0.7381545901298523, "num_tokens": 25381640.0, "step": 4618, "train/ce_loss": 0.6459738612174988 }, { "epoch": 0.4565948190626854, "step": 4618, "train/sim_loss": 0.05078125 }, { "epoch": 0.4565948190626854, "step": 4618, "train/total_loss": 0.11537864059209824 }, { "entropy": 8.99242877960205, "epoch": 0.4566936919122009, "mean_token_accuracy": 0.7122473120689392, "num_tokens": 25387011.0, "step": 4619, "train/ce_loss": 0.4913102090358734 }, { "epoch": 0.4566936919122009, "step": 4619, "train/sim_loss": 0.01171875 }, { "epoch": 0.4566936919122009, "step": 4619, "train/total_loss": 0.06084977090358734 }, { "epoch": 0.4567925647617164, "grad_norm": 0.6787588596343994, "learning_rate": 8.860455916530684e-06, "loss": 0.1401, "step": 4620 }, { "entropy": 8.938409805297852, "epoch": 0.4567925647617164, "mean_token_accuracy": 0.7728426456451416, "num_tokens": 25392411.0, "step": 4620, "train/ce_loss": 0.6055096387863159 }, { "epoch": 0.4567925647617164, "step": 4620, "train/sim_loss": 0.05078125 }, { "epoch": 0.4567925647617164, "step": 4620, "train/total_loss": 0.11133221536874771 }, { "entropy": 8.89828872680664, "epoch": 0.45689143761123197, "mean_token_accuracy": 0.7371364831924438, "num_tokens": 25397900.0, "step": 4621, "train/ce_loss": 0.4574694335460663 }, { "epoch": 0.45689143761123197, "step": 4621, "train/sim_loss": 0.03125 }, { "epoch": 0.45689143761123197, "step": 4621, "train/total_loss": 0.07699694484472275 }, { "entropy": 8.746685981750488, "epoch": 0.4569903104607475, "mean_token_accuracy": 0.7985829710960388, "num_tokens": 25403517.0, "step": 4622, "train/ce_loss": 0.41552671790122986 }, { "epoch": 0.4569903104607475, "step": 4622, "train/sim_loss": 0.046875 }, { "epoch": 0.4569903104607475, "step": 4622, "train/total_loss": 0.08842767775058746 }, { "entropy": 9.014689445495605, "epoch": 0.457089183310263, "mean_token_accuracy": 0.7490445971488953, "num_tokens": 25408878.0, "step": 4623, "train/ce_loss": 0.9857243299484253 }, { "epoch": 0.457089183310263, "step": 4623, "train/sim_loss": 0.03515625 }, { "epoch": 0.457089183310263, "step": 4623, "train/total_loss": 0.13372868299484253 }, { "entropy": 8.907308578491211, "epoch": 0.45718805615977853, "mean_token_accuracy": 0.7744185924530029, "num_tokens": 25414252.0, "step": 4624, "train/ce_loss": 0.7652708292007446 }, { "epoch": 0.45718805615977853, "step": 4624, "train/sim_loss": 0.05078125 }, { "epoch": 0.45718805615977853, "step": 4624, "train/total_loss": 0.12730833888053894 }, { "entropy": 9.327630996704102, "epoch": 0.4572869290092941, "mean_token_accuracy": 0.7603626847267151, "num_tokens": 25419631.0, "step": 4625, "train/ce_loss": 1.1175285577774048 }, { "epoch": 0.4572869290092941, "step": 4625, "train/sim_loss": 0.078125 }, { "epoch": 0.4572869290092941, "step": 4625, "train/total_loss": 0.18987786769866943 }, { "entropy": 9.060724258422852, "epoch": 0.45738580185880956, "mean_token_accuracy": 0.7297297120094299, "num_tokens": 25425134.0, "step": 4626, "train/ce_loss": 1.6294738054275513 }, { "epoch": 0.45738580185880956, "step": 4626, "train/sim_loss": 0.0390625 }, { "epoch": 0.45738580185880956, "step": 4626, "train/total_loss": 0.2020098865032196 }, { "entropy": 8.417543411254883, "epoch": 0.4574846747083251, "mean_token_accuracy": 0.7379032373428345, "num_tokens": 25430777.0, "step": 4627, "train/ce_loss": 0.9446748495101929 }, { "epoch": 0.4574846747083251, "step": 4627, "train/sim_loss": 0.0859375 }, { "epoch": 0.4574846747083251, "step": 4627, "train/total_loss": 0.18040499091148376 }, { "entropy": 9.228109359741211, "epoch": 0.45758354755784064, "mean_token_accuracy": 0.74631267786026, "num_tokens": 25436086.0, "step": 4628, "train/ce_loss": 0.9453728199005127 }, { "epoch": 0.45758354755784064, "step": 4628, "train/sim_loss": 0.0546875 }, { "epoch": 0.45758354755784064, "step": 4628, "train/total_loss": 0.14922478795051575 }, { "entropy": 8.66535758972168, "epoch": 0.4576824204073561, "mean_token_accuracy": 0.7074363827705383, "num_tokens": 25441711.0, "step": 4629, "train/ce_loss": 1.2778722047805786 }, { "epoch": 0.4576824204073561, "step": 4629, "train/sim_loss": 0.08984375 }, { "epoch": 0.4576824204073561, "step": 4629, "train/total_loss": 0.21763096749782562 }, { "entropy": 8.860687255859375, "epoch": 0.45778129325687167, "mean_token_accuracy": 0.7219570279121399, "num_tokens": 25447181.0, "step": 4630, "train/ce_loss": 0.5432073473930359 }, { "epoch": 0.45778129325687167, "step": 4630, "train/sim_loss": 0.06640625 }, { "epoch": 0.45778129325687167, "step": 4630, "train/total_loss": 0.12072698771953583 }, { "entropy": 8.880346298217773, "epoch": 0.4578801661063872, "mean_token_accuracy": 0.6639722585678101, "num_tokens": 25452615.0, "step": 4631, "train/ce_loss": 1.570489525794983 }, { "epoch": 0.4578801661063872, "step": 4631, "train/sim_loss": 0.12109375 }, { "epoch": 0.4578801661063872, "step": 4631, "train/total_loss": 0.27814269065856934 }, { "entropy": 8.481561660766602, "epoch": 0.4579790389559027, "mean_token_accuracy": 0.7666051387786865, "num_tokens": 25458287.0, "step": 4632, "train/ce_loss": 0.467759370803833 }, { "epoch": 0.4579790389559027, "step": 4632, "train/sim_loss": 0.02734375 }, { "epoch": 0.4579790389559027, "step": 4632, "train/total_loss": 0.0741196870803833 }, { "entropy": 8.511577606201172, "epoch": 0.45807791180541824, "mean_token_accuracy": 0.7488908767700195, "num_tokens": 25464063.0, "step": 4633, "train/ce_loss": 1.019562840461731 }, { "epoch": 0.45807791180541824, "step": 4633, "train/sim_loss": 0.0625 }, { "epoch": 0.45807791180541824, "step": 4633, "train/total_loss": 0.16445627808570862 }, { "entropy": 9.138311386108398, "epoch": 0.4581767846549338, "mean_token_accuracy": 0.699999988079071, "num_tokens": 25469504.0, "step": 4634, "train/ce_loss": 0.7790408134460449 }, { "epoch": 0.4581767846549338, "step": 4634, "train/sim_loss": 0.06640625 }, { "epoch": 0.4581767846549338, "step": 4634, "train/total_loss": 0.14431032538414001 }, { "entropy": 8.815467834472656, "epoch": 0.45827565750444926, "mean_token_accuracy": 0.7181926369667053, "num_tokens": 25474914.0, "step": 4635, "train/ce_loss": 0.8466480374336243 }, { "epoch": 0.45827565750444926, "step": 4635, "train/sim_loss": 0.02734375 }, { "epoch": 0.45827565750444926, "step": 4635, "train/total_loss": 0.11200855672359467 }, { "entropy": 8.868912696838379, "epoch": 0.4583745303539648, "mean_token_accuracy": 0.760221004486084, "num_tokens": 25480446.0, "step": 4636, "train/ce_loss": 0.7298218607902527 }, { "epoch": 0.4583745303539648, "step": 4636, "train/sim_loss": 0.046875 }, { "epoch": 0.4583745303539648, "step": 4636, "train/total_loss": 0.11985718458890915 }, { "entropy": 8.910648345947266, "epoch": 0.45847340320348035, "mean_token_accuracy": 0.7231800556182861, "num_tokens": 25486045.0, "step": 4637, "train/ce_loss": 0.5913619995117188 }, { "epoch": 0.45847340320348035, "step": 4637, "train/sim_loss": 0.046875 }, { "epoch": 0.45847340320348035, "step": 4637, "train/total_loss": 0.10601119697093964 }, { "entropy": 8.657459259033203, "epoch": 0.45857227605299583, "mean_token_accuracy": 0.7287522554397583, "num_tokens": 25491718.0, "step": 4638, "train/ce_loss": 0.5734972953796387 }, { "epoch": 0.45857227605299583, "step": 4638, "train/sim_loss": 0.01953125 }, { "epoch": 0.45857227605299583, "step": 4638, "train/total_loss": 0.07688097655773163 }, { "entropy": 8.496492385864258, "epoch": 0.45867114890251137, "mean_token_accuracy": 0.7019748687744141, "num_tokens": 25497478.0, "step": 4639, "train/ce_loss": 0.4672516882419586 }, { "epoch": 0.45867114890251137, "step": 4639, "train/sim_loss": 0.06640625 }, { "epoch": 0.45867114890251137, "step": 4639, "train/total_loss": 0.11313141882419586 }, { "epoch": 0.4587700217520269, "grad_norm": 0.8635823726654053, "learning_rate": 8.855511051772734e-06, "loss": 0.1362, "step": 4640 }, { "entropy": 9.305654525756836, "epoch": 0.4587700217520269, "mean_token_accuracy": 0.7367724776268005, "num_tokens": 25502781.0, "step": 4640, "train/ce_loss": 0.8137586712837219 }, { "epoch": 0.4587700217520269, "step": 4640, "train/sim_loss": 0.0546875 }, { "epoch": 0.4587700217520269, "step": 4640, "train/total_loss": 0.1360633671283722 }, { "entropy": 8.960813522338867, "epoch": 0.4588688946015424, "mean_token_accuracy": 0.7320261597633362, "num_tokens": 25508076.0, "step": 4641, "train/ce_loss": 0.7063106298446655 }, { "epoch": 0.4588688946015424, "step": 4641, "train/sim_loss": 0.046875 }, { "epoch": 0.4588688946015424, "step": 4641, "train/total_loss": 0.11750606447458267 }, { "entropy": 8.56010627746582, "epoch": 0.45896776745105794, "mean_token_accuracy": 0.709057629108429, "num_tokens": 25513820.0, "step": 4642, "train/ce_loss": 1.31353759765625 }, { "epoch": 0.45896776745105794, "step": 4642, "train/sim_loss": 0.14453125 }, { "epoch": 0.45896776745105794, "step": 4642, "train/total_loss": 0.2758850157260895 }, { "entropy": 9.012167930603027, "epoch": 0.4590666403005735, "mean_token_accuracy": 0.7700471878051758, "num_tokens": 25519285.0, "step": 4643, "train/ce_loss": 0.6511886119842529 }, { "epoch": 0.4590666403005735, "step": 4643, "train/sim_loss": 0.08203125 }, { "epoch": 0.4590666403005735, "step": 4643, "train/total_loss": 0.14715011417865753 }, { "entropy": 8.80374526977539, "epoch": 0.45916551315008897, "mean_token_accuracy": 0.7959617376327515, "num_tokens": 25524815.0, "step": 4644, "train/ce_loss": 0.6153544187545776 }, { "epoch": 0.45916551315008897, "step": 4644, "train/sim_loss": 0.06640625 }, { "epoch": 0.45916551315008897, "step": 4644, "train/total_loss": 0.12794169783592224 }, { "entropy": 8.495025634765625, "epoch": 0.4592643859996045, "mean_token_accuracy": 0.7286307215690613, "num_tokens": 25530551.0, "step": 4645, "train/ce_loss": 0.541894257068634 }, { "epoch": 0.4592643859996045, "step": 4645, "train/sim_loss": 0.0546875 }, { "epoch": 0.4592643859996045, "step": 4645, "train/total_loss": 0.10887692868709564 }, { "entropy": 8.737468719482422, "epoch": 0.45936325884912005, "mean_token_accuracy": 0.7264150977134705, "num_tokens": 25536053.0, "step": 4646, "train/ce_loss": 1.742655873298645 }, { "epoch": 0.45936325884912005, "step": 4646, "train/sim_loss": 0.0546875 }, { "epoch": 0.45936325884912005, "step": 4646, "train/total_loss": 0.22895309329032898 }, { "entropy": 8.948738098144531, "epoch": 0.45946213169863553, "mean_token_accuracy": 0.743852436542511, "num_tokens": 25541689.0, "step": 4647, "train/ce_loss": 0.4764406979084015 }, { "epoch": 0.45946213169863553, "step": 4647, "train/sim_loss": 0.0390625 }, { "epoch": 0.45946213169863553, "step": 4647, "train/total_loss": 0.08670657128095627 }, { "entropy": 8.671360969543457, "epoch": 0.4595610045481511, "mean_token_accuracy": 0.8338279128074646, "num_tokens": 25547319.0, "step": 4648, "train/ce_loss": 0.5315372943878174 }, { "epoch": 0.4595610045481511, "step": 4648, "train/sim_loss": 0.04296875 }, { "epoch": 0.4595610045481511, "step": 4648, "train/total_loss": 0.09612248092889786 }, { "entropy": 8.957210540771484, "epoch": 0.4596598773976666, "mean_token_accuracy": 0.7728983759880066, "num_tokens": 25552729.0, "step": 4649, "train/ce_loss": 0.7430455088615417 }, { "epoch": 0.4596598773976666, "step": 4649, "train/sim_loss": 0.02734375 }, { "epoch": 0.4596598773976666, "step": 4649, "train/total_loss": 0.10164830088615417 }, { "entropy": 8.9337158203125, "epoch": 0.4597587502471821, "mean_token_accuracy": 0.7418967485427856, "num_tokens": 25558143.0, "step": 4650, "train/ce_loss": 0.8138685822486877 }, { "epoch": 0.4597587502471821, "step": 4650, "train/sim_loss": 0.04296875 }, { "epoch": 0.4597587502471821, "step": 4650, "train/total_loss": 0.12435560673475266 }, { "entropy": 9.030529975891113, "epoch": 0.45985762309669764, "mean_token_accuracy": 0.7400419116020203, "num_tokens": 25563855.0, "step": 4651, "train/ce_loss": 1.0260827541351318 }, { "epoch": 0.45985762309669764, "step": 4651, "train/sim_loss": 0.06640625 }, { "epoch": 0.45985762309669764, "step": 4651, "train/total_loss": 0.16901452839374542 }, { "entropy": 9.124317169189453, "epoch": 0.4599564959462132, "mean_token_accuracy": 0.7426470518112183, "num_tokens": 25569325.0, "step": 4652, "train/ce_loss": 0.9064507484436035 }, { "epoch": 0.4599564959462132, "step": 4652, "train/sim_loss": 0.0703125 }, { "epoch": 0.4599564959462132, "step": 4652, "train/total_loss": 0.16095757484436035 }, { "entropy": 8.884060859680176, "epoch": 0.46005536879572867, "mean_token_accuracy": 0.8008255958557129, "num_tokens": 25574904.0, "step": 4653, "train/ce_loss": 0.5020211935043335 }, { "epoch": 0.46005536879572867, "step": 4653, "train/sim_loss": 0.0234375 }, { "epoch": 0.46005536879572867, "step": 4653, "train/total_loss": 0.07363961637020111 }, { "entropy": 8.713064193725586, "epoch": 0.4601542416452442, "mean_token_accuracy": 0.7065009474754333, "num_tokens": 25580495.0, "step": 4654, "train/ce_loss": 1.2107566595077515 }, { "epoch": 0.4601542416452442, "step": 4654, "train/sim_loss": 0.125 }, { "epoch": 0.4601542416452442, "step": 4654, "train/total_loss": 0.24607565999031067 }, { "entropy": 9.021772384643555, "epoch": 0.46025311449475975, "mean_token_accuracy": 0.7191011309623718, "num_tokens": 25585927.0, "step": 4655, "train/ce_loss": 0.47280171513557434 }, { "epoch": 0.46025311449475975, "step": 4655, "train/sim_loss": 0.10546875 }, { "epoch": 0.46025311449475975, "step": 4655, "train/total_loss": 0.1527489274740219 }, { "entropy": 9.0244140625, "epoch": 0.46035198734427524, "mean_token_accuracy": 0.7678571343421936, "num_tokens": 25591294.0, "step": 4656, "train/ce_loss": 0.9129823446273804 }, { "epoch": 0.46035198734427524, "step": 4656, "train/sim_loss": 0.02734375 }, { "epoch": 0.46035198734427524, "step": 4656, "train/total_loss": 0.11864198744297028 }, { "entropy": 9.065070152282715, "epoch": 0.4604508601937908, "mean_token_accuracy": 0.7475000023841858, "num_tokens": 25596679.0, "step": 4657, "train/ce_loss": 0.8842766284942627 }, { "epoch": 0.4604508601937908, "step": 4657, "train/sim_loss": 0.03515625 }, { "epoch": 0.4604508601937908, "step": 4657, "train/total_loss": 0.12358391284942627 }, { "entropy": 8.662723541259766, "epoch": 0.4605497330433063, "mean_token_accuracy": 0.6631016135215759, "num_tokens": 25602417.0, "step": 4658, "train/ce_loss": 2.047483444213867 }, { "epoch": 0.4605497330433063, "step": 4658, "train/sim_loss": 0.04296875 }, { "epoch": 0.4605497330433063, "step": 4658, "train/total_loss": 0.24771709740161896 }, { "entropy": 8.899425506591797, "epoch": 0.4606486058928218, "mean_token_accuracy": 0.7084308862686157, "num_tokens": 25607874.0, "step": 4659, "train/ce_loss": 0.9925220608711243 }, { "epoch": 0.4606486058928218, "step": 4659, "train/sim_loss": 0.0625 }, { "epoch": 0.4606486058928218, "step": 4659, "train/total_loss": 0.16175220906734467 }, { "epoch": 0.46074747874233735, "grad_norm": 0.8849987387657166, "learning_rate": 8.850566187014787e-06, "loss": 0.1358, "step": 4660 }, { "entropy": 8.614114761352539, "epoch": 0.46074747874233735, "mean_token_accuracy": 0.7207304239273071, "num_tokens": 25613425.0, "step": 4660, "train/ce_loss": 0.706288754940033 }, { "epoch": 0.46074747874233735, "step": 4660, "train/sim_loss": 0.0546875 }, { "epoch": 0.46074747874233735, "step": 4660, "train/total_loss": 0.12531638145446777 }, { "entropy": 8.781253814697266, "epoch": 0.4608463515918529, "mean_token_accuracy": 0.7967742085456848, "num_tokens": 25618929.0, "step": 4661, "train/ce_loss": 0.7759692072868347 }, { "epoch": 0.4608463515918529, "step": 4661, "train/sim_loss": 0.0859375 }, { "epoch": 0.4608463515918529, "step": 4661, "train/total_loss": 0.16353443264961243 }, { "entropy": 9.000541687011719, "epoch": 0.4609452244413684, "mean_token_accuracy": 0.7220779061317444, "num_tokens": 25624254.0, "step": 4662, "train/ce_loss": 0.6386998295783997 }, { "epoch": 0.4609452244413684, "step": 4662, "train/sim_loss": 0.03515625 }, { "epoch": 0.4609452244413684, "step": 4662, "train/total_loss": 0.09902623295783997 }, { "entropy": 8.971948623657227, "epoch": 0.4610440972908839, "mean_token_accuracy": 0.6768149733543396, "num_tokens": 25629718.0, "step": 4663, "train/ce_loss": 1.2646275758743286 }, { "epoch": 0.4610440972908839, "step": 4663, "train/sim_loss": 0.0546875 }, { "epoch": 0.4610440972908839, "step": 4663, "train/total_loss": 0.18115025758743286 }, { "entropy": 8.85697078704834, "epoch": 0.46114297014039946, "mean_token_accuracy": 0.7752442955970764, "num_tokens": 25635157.0, "step": 4664, "train/ce_loss": 0.5986393690109253 }, { "epoch": 0.46114297014039946, "step": 4664, "train/sim_loss": 0.02734375 }, { "epoch": 0.46114297014039946, "step": 4664, "train/total_loss": 0.08720768988132477 }, { "entropy": 8.57357120513916, "epoch": 0.461241842989915, "mean_token_accuracy": 0.667560338973999, "num_tokens": 25640808.0, "step": 4665, "train/ce_loss": 1.0299381017684937 }, { "epoch": 0.461241842989915, "step": 4665, "train/sim_loss": 0.03125 }, { "epoch": 0.461241842989915, "step": 4665, "train/total_loss": 0.13424381613731384 }, { "entropy": 8.750938415527344, "epoch": 0.4613407158394305, "mean_token_accuracy": 0.7856341004371643, "num_tokens": 25646322.0, "step": 4666, "train/ce_loss": 1.094665288925171 }, { "epoch": 0.4613407158394305, "step": 4666, "train/sim_loss": 0.078125 }, { "epoch": 0.4613407158394305, "step": 4666, "train/total_loss": 0.1875915229320526 }, { "entropy": 8.827033996582031, "epoch": 0.461439588688946, "mean_token_accuracy": 0.7943037748336792, "num_tokens": 25651891.0, "step": 4667, "train/ce_loss": 0.7817019820213318 }, { "epoch": 0.461439588688946, "step": 4667, "train/sim_loss": 0.05078125 }, { "epoch": 0.461439588688946, "step": 4667, "train/total_loss": 0.12895146012306213 }, { "entropy": 8.932878494262695, "epoch": 0.46153846153846156, "mean_token_accuracy": 0.763557493686676, "num_tokens": 25657424.0, "step": 4668, "train/ce_loss": 0.3421185314655304 }, { "epoch": 0.46153846153846156, "step": 4668, "train/sim_loss": 0.0625 }, { "epoch": 0.46153846153846156, "step": 4668, "train/total_loss": 0.09671185910701752 }, { "entropy": 8.635307312011719, "epoch": 0.46163733438797705, "mean_token_accuracy": 0.7676387429237366, "num_tokens": 25663107.0, "step": 4669, "train/ce_loss": 0.6634699106216431 }, { "epoch": 0.46163733438797705, "step": 4669, "train/sim_loss": 0.03515625 }, { "epoch": 0.46163733438797705, "step": 4669, "train/total_loss": 0.10150324553251266 }, { "entropy": 8.739557266235352, "epoch": 0.4617362072374926, "mean_token_accuracy": 0.7839999794960022, "num_tokens": 25668619.0, "step": 4670, "train/ce_loss": 0.7550654411315918 }, { "epoch": 0.4617362072374926, "step": 4670, "train/sim_loss": 0.03125 }, { "epoch": 0.4617362072374926, "step": 4670, "train/total_loss": 0.1067565456032753 }, { "entropy": 9.104496002197266, "epoch": 0.46183508008700813, "mean_token_accuracy": 0.7668328881263733, "num_tokens": 25674137.0, "step": 4671, "train/ce_loss": 1.0781469345092773 }, { "epoch": 0.46183508008700813, "step": 4671, "train/sim_loss": 0.09375 }, { "epoch": 0.46183508008700813, "step": 4671, "train/total_loss": 0.2015646994113922 }, { "entropy": 8.474949836730957, "epoch": 0.4619339529365236, "mean_token_accuracy": 0.7183364629745483, "num_tokens": 25679738.0, "step": 4672, "train/ce_loss": 1.7723780870437622 }, { "epoch": 0.4619339529365236, "step": 4672, "train/sim_loss": 0.09375 }, { "epoch": 0.4619339529365236, "step": 4672, "train/total_loss": 0.2709878087043762 }, { "entropy": 8.55471420288086, "epoch": 0.46203282578603916, "mean_token_accuracy": 0.7267932295799255, "num_tokens": 25685360.0, "step": 4673, "train/ce_loss": 0.8101972937583923 }, { "epoch": 0.46203282578603916, "step": 4673, "train/sim_loss": 0.06640625 }, { "epoch": 0.46203282578603916, "step": 4673, "train/total_loss": 0.14742597937583923 }, { "entropy": 8.76706314086914, "epoch": 0.4621316986355547, "mean_token_accuracy": 0.6860730648040771, "num_tokens": 25690817.0, "step": 4674, "train/ce_loss": 1.2792823314666748 }, { "epoch": 0.4621316986355547, "step": 4674, "train/sim_loss": 0.11328125 }, { "epoch": 0.4621316986355547, "step": 4674, "train/total_loss": 0.2412094920873642 }, { "entropy": 9.04104995727539, "epoch": 0.4622305714850702, "mean_token_accuracy": 0.6891252994537354, "num_tokens": 25696362.0, "step": 4675, "train/ce_loss": 1.5462439060211182 }, { "epoch": 0.4622305714850702, "step": 4675, "train/sim_loss": 0.07421875 }, { "epoch": 0.4622305714850702, "step": 4675, "train/total_loss": 0.22884313762187958 }, { "entropy": 8.685951232910156, "epoch": 0.4623294443345857, "mean_token_accuracy": 0.7224975228309631, "num_tokens": 25701845.0, "step": 4676, "train/ce_loss": 0.5566030144691467 }, { "epoch": 0.4623294443345857, "step": 4676, "train/sim_loss": 0.05859375 }, { "epoch": 0.4623294443345857, "step": 4676, "train/total_loss": 0.11425405740737915 }, { "entropy": 8.811457633972168, "epoch": 0.46242831718410127, "mean_token_accuracy": 0.7439824938774109, "num_tokens": 25707367.0, "step": 4677, "train/ce_loss": 0.6919662356376648 }, { "epoch": 0.46242831718410127, "step": 4677, "train/sim_loss": 0.046875 }, { "epoch": 0.46242831718410127, "step": 4677, "train/total_loss": 0.11607162654399872 }, { "entropy": 8.686164855957031, "epoch": 0.46252719003361675, "mean_token_accuracy": 0.7201327681541443, "num_tokens": 25712950.0, "step": 4678, "train/ce_loss": 0.8955789804458618 }, { "epoch": 0.46252719003361675, "step": 4678, "train/sim_loss": 0.046875 }, { "epoch": 0.46252719003361675, "step": 4678, "train/total_loss": 0.13643290102481842 }, { "entropy": 8.559389114379883, "epoch": 0.4626260628831323, "mean_token_accuracy": 0.7290686964988708, "num_tokens": 25718500.0, "step": 4679, "train/ce_loss": 0.4146448075771332 }, { "epoch": 0.4626260628831323, "step": 4679, "train/sim_loss": 0.046875 }, { "epoch": 0.4626260628831323, "step": 4679, "train/total_loss": 0.08833947777748108 }, { "epoch": 0.46272493573264784, "grad_norm": 0.6352092623710632, "learning_rate": 8.845621322256837e-06, "loss": 0.1401, "step": 4680 }, { "entropy": 9.031084060668945, "epoch": 0.46272493573264784, "mean_token_accuracy": 0.7264021635055542, "num_tokens": 25723889.0, "step": 4680, "train/ce_loss": 0.9964166879653931 }, { "epoch": 0.46272493573264784, "step": 4680, "train/sim_loss": 0.0546875 }, { "epoch": 0.46272493573264784, "step": 4680, "train/total_loss": 0.15432918071746826 }, { "entropy": 8.754541397094727, "epoch": 0.4628238085821633, "mean_token_accuracy": 0.8026859760284424, "num_tokens": 25729468.0, "step": 4681, "train/ce_loss": 0.45627859234809875 }, { "epoch": 0.4628238085821633, "step": 4681, "train/sim_loss": 0.01953125 }, { "epoch": 0.4628238085821633, "step": 4681, "train/total_loss": 0.06515911221504211 }, { "entropy": 9.042980194091797, "epoch": 0.46292268143167886, "mean_token_accuracy": 0.7116883397102356, "num_tokens": 25734826.0, "step": 4682, "train/ce_loss": 0.8736856579780579 }, { "epoch": 0.46292268143167886, "step": 4682, "train/sim_loss": 0.03125 }, { "epoch": 0.46292268143167886, "step": 4682, "train/total_loss": 0.11861857026815414 }, { "entropy": 9.094419479370117, "epoch": 0.4630215542811944, "mean_token_accuracy": 0.7101265788078308, "num_tokens": 25740273.0, "step": 4683, "train/ce_loss": 0.5345808863639832 }, { "epoch": 0.4630215542811944, "step": 4683, "train/sim_loss": 0.10546875 }, { "epoch": 0.4630215542811944, "step": 4683, "train/total_loss": 0.1589268445968628 }, { "entropy": 8.707557678222656, "epoch": 0.4631204271307099, "mean_token_accuracy": 0.7018970251083374, "num_tokens": 25745939.0, "step": 4684, "train/ce_loss": 0.5001704096794128 }, { "epoch": 0.4631204271307099, "step": 4684, "train/sim_loss": 0.0859375 }, { "epoch": 0.4631204271307099, "step": 4684, "train/total_loss": 0.13595454394817352 }, { "entropy": 8.877174377441406, "epoch": 0.46321929998022543, "mean_token_accuracy": 0.7350157499313354, "num_tokens": 25751411.0, "step": 4685, "train/ce_loss": 1.2219676971435547 }, { "epoch": 0.46321929998022543, "step": 4685, "train/sim_loss": 0.125 }, { "epoch": 0.46321929998022543, "step": 4685, "train/total_loss": 0.247196763753891 }, { "entropy": 8.622108459472656, "epoch": 0.46331817282974097, "mean_token_accuracy": 0.7388211488723755, "num_tokens": 25756988.0, "step": 4686, "train/ce_loss": 1.0111274719238281 }, { "epoch": 0.46331817282974097, "step": 4686, "train/sim_loss": 0.03125 }, { "epoch": 0.46331817282974097, "step": 4686, "train/total_loss": 0.1323627531528473 }, { "entropy": 8.756128311157227, "epoch": 0.46341704567925646, "mean_token_accuracy": 0.7147846221923828, "num_tokens": 25762483.0, "step": 4687, "train/ce_loss": 0.3884861469268799 }, { "epoch": 0.46341704567925646, "step": 4687, "train/sim_loss": 0.046875 }, { "epoch": 0.46341704567925646, "step": 4687, "train/total_loss": 0.08572361618280411 }, { "entropy": 9.09113597869873, "epoch": 0.463515918528772, "mean_token_accuracy": 0.7403246164321899, "num_tokens": 25767879.0, "step": 4688, "train/ce_loss": 0.7233476638793945 }, { "epoch": 0.463515918528772, "step": 4688, "train/sim_loss": 0.0390625 }, { "epoch": 0.463515918528772, "step": 4688, "train/total_loss": 0.11139726638793945 }, { "entropy": 8.926886558532715, "epoch": 0.46361479137828754, "mean_token_accuracy": 0.7733026742935181, "num_tokens": 25773390.0, "step": 4689, "train/ce_loss": 0.6724098324775696 }, { "epoch": 0.46361479137828754, "step": 4689, "train/sim_loss": 0.06640625 }, { "epoch": 0.46361479137828754, "step": 4689, "train/total_loss": 0.13364723324775696 }, { "entropy": 8.85993766784668, "epoch": 0.463713664227803, "mean_token_accuracy": 0.686658501625061, "num_tokens": 25778861.0, "step": 4690, "train/ce_loss": 0.4697701930999756 }, { "epoch": 0.463713664227803, "step": 4690, "train/sim_loss": 0.046875 }, { "epoch": 0.463713664227803, "step": 4690, "train/total_loss": 0.09385202080011368 }, { "entropy": 8.896854400634766, "epoch": 0.46381253707731857, "mean_token_accuracy": 0.7316784858703613, "num_tokens": 25784296.0, "step": 4691, "train/ce_loss": 0.6953104734420776 }, { "epoch": 0.46381253707731857, "step": 4691, "train/sim_loss": 0.046875 }, { "epoch": 0.46381253707731857, "step": 4691, "train/total_loss": 0.11640604585409164 }, { "entropy": 8.809329986572266, "epoch": 0.4639114099268341, "mean_token_accuracy": 0.7305825352668762, "num_tokens": 25789804.0, "step": 4692, "train/ce_loss": 0.7745881080627441 }, { "epoch": 0.4639114099268341, "step": 4692, "train/sim_loss": 0.046875 }, { "epoch": 0.4639114099268341, "step": 4692, "train/total_loss": 0.12433381378650665 }, { "entropy": 9.079971313476562, "epoch": 0.4640102827763496, "mean_token_accuracy": 0.7224435806274414, "num_tokens": 25795105.0, "step": 4693, "train/ce_loss": 0.5507135391235352 }, { "epoch": 0.4640102827763496, "step": 4693, "train/sim_loss": 0.05859375 }, { "epoch": 0.4640102827763496, "step": 4693, "train/total_loss": 0.11366510391235352 }, { "entropy": 8.80682373046875, "epoch": 0.46410915562586513, "mean_token_accuracy": 0.7848517894744873, "num_tokens": 25800642.0, "step": 4694, "train/ce_loss": 0.5212928652763367 }, { "epoch": 0.46410915562586513, "step": 4694, "train/sim_loss": 0.0234375 }, { "epoch": 0.46410915562586513, "step": 4694, "train/total_loss": 0.07556678354740143 }, { "entropy": 8.6260986328125, "epoch": 0.4642080284753807, "mean_token_accuracy": 0.8019693493843079, "num_tokens": 25806216.0, "step": 4695, "train/ce_loss": 0.8238219618797302 }, { "epoch": 0.4642080284753807, "step": 4695, "train/sim_loss": 0.0390625 }, { "epoch": 0.4642080284753807, "step": 4695, "train/total_loss": 0.1214446946978569 }, { "entropy": 8.797420501708984, "epoch": 0.46430690132489616, "mean_token_accuracy": 0.7335423231124878, "num_tokens": 25811840.0, "step": 4696, "train/ce_loss": 1.4650239944458008 }, { "epoch": 0.46430690132489616, "step": 4696, "train/sim_loss": 0.08203125 }, { "epoch": 0.46430690132489616, "step": 4696, "train/total_loss": 0.22853365540504456 }, { "entropy": 8.96566390991211, "epoch": 0.4644057741744117, "mean_token_accuracy": 0.7333333492279053, "num_tokens": 25817291.0, "step": 4697, "train/ce_loss": 1.2886356115341187 }, { "epoch": 0.4644057741744117, "step": 4697, "train/sim_loss": 0.0703125 }, { "epoch": 0.4644057741744117, "step": 4697, "train/total_loss": 0.19917605817317963 }, { "entropy": 8.990910530090332, "epoch": 0.46450464702392724, "mean_token_accuracy": 0.7770193219184875, "num_tokens": 25822936.0, "step": 4698, "train/ce_loss": 0.5632688999176025 }, { "epoch": 0.46450464702392724, "step": 4698, "train/sim_loss": 0.109375 }, { "epoch": 0.46450464702392724, "step": 4698, "train/total_loss": 0.16570189595222473 }, { "entropy": 8.948915481567383, "epoch": 0.4646035198734427, "mean_token_accuracy": 0.7710280418395996, "num_tokens": 25828510.0, "step": 4699, "train/ce_loss": 0.6864312887191772 }, { "epoch": 0.4646035198734427, "step": 4699, "train/sim_loss": 0.03125 }, { "epoch": 0.4646035198734427, "step": 4699, "train/total_loss": 0.09989313036203384 }, { "epoch": 0.46470239272295827, "grad_norm": 0.560984194278717, "learning_rate": 8.840676457498888e-06, "loss": 0.1395, "step": 4700 }, { "entropy": 9.136213302612305, "epoch": 0.46470239272295827, "mean_token_accuracy": 0.7121418714523315, "num_tokens": 25833819.0, "step": 4700, "train/ce_loss": 0.7322628498077393 }, { "epoch": 0.46470239272295827, "step": 4700, "train/sim_loss": 0.0703125 }, { "epoch": 0.46470239272295827, "step": 4700, "train/total_loss": 0.14353878796100616 }, { "entropy": 9.01425552368164, "epoch": 0.4648012655724738, "mean_token_accuracy": 0.7847025394439697, "num_tokens": 25839161.0, "step": 4701, "train/ce_loss": 1.1679331064224243 }, { "epoch": 0.4648012655724738, "step": 4701, "train/sim_loss": 0.04296875 }, { "epoch": 0.4648012655724738, "step": 4701, "train/total_loss": 0.15976205468177795 }, { "entropy": 8.661323547363281, "epoch": 0.4649001384219893, "mean_token_accuracy": 0.7454153299331665, "num_tokens": 25844710.0, "step": 4702, "train/ce_loss": 0.7263745665550232 }, { "epoch": 0.4649001384219893, "step": 4702, "train/sim_loss": 0.03515625 }, { "epoch": 0.4649001384219893, "step": 4702, "train/total_loss": 0.10779371112585068 }, { "entropy": 9.367576599121094, "epoch": 0.46499901127150484, "mean_token_accuracy": 0.7889763712882996, "num_tokens": 25849930.0, "step": 4703, "train/ce_loss": 0.7071484327316284 }, { "epoch": 0.46499901127150484, "step": 4703, "train/sim_loss": 0.015625 }, { "epoch": 0.46499901127150484, "step": 4703, "train/total_loss": 0.08633984625339508 }, { "entropy": 8.991268157958984, "epoch": 0.4650978841210204, "mean_token_accuracy": 0.735516369342804, "num_tokens": 25855330.0, "step": 4704, "train/ce_loss": 0.9504019618034363 }, { "epoch": 0.4650978841210204, "step": 4704, "train/sim_loss": 0.0859375 }, { "epoch": 0.4650978841210204, "step": 4704, "train/total_loss": 0.1809777021408081 }, { "entropy": 8.709827423095703, "epoch": 0.4651967569705359, "mean_token_accuracy": 0.7018181681632996, "num_tokens": 25860760.0, "step": 4705, "train/ce_loss": 0.49367645382881165 }, { "epoch": 0.4651967569705359, "step": 4705, "train/sim_loss": 0.02734375 }, { "epoch": 0.4651967569705359, "step": 4705, "train/total_loss": 0.07671140134334564 }, { "entropy": 9.270561218261719, "epoch": 0.4652956298200514, "mean_token_accuracy": 0.7466124892234802, "num_tokens": 25866106.0, "step": 4706, "train/ce_loss": 0.9161360263824463 }, { "epoch": 0.4652956298200514, "step": 4706, "train/sim_loss": 0.0390625 }, { "epoch": 0.4652956298200514, "step": 4706, "train/total_loss": 0.13067610561847687 }, { "entropy": 8.606980323791504, "epoch": 0.46539450266956695, "mean_token_accuracy": 0.7202572226524353, "num_tokens": 25871709.0, "step": 4707, "train/ce_loss": 0.98586505651474 }, { "epoch": 0.46539450266956695, "step": 4707, "train/sim_loss": 0.02734375 }, { "epoch": 0.46539450266956695, "step": 4707, "train/total_loss": 0.12593024969100952 }, { "entropy": 9.122631072998047, "epoch": 0.4654933755190825, "mean_token_accuracy": 0.7392995953559875, "num_tokens": 25877100.0, "step": 4708, "train/ce_loss": 0.7197149395942688 }, { "epoch": 0.4654933755190825, "step": 4708, "train/sim_loss": 0.05859375 }, { "epoch": 0.4654933755190825, "step": 4708, "train/total_loss": 0.13056525588035583 }, { "entropy": 8.78554916381836, "epoch": 0.46559224836859797, "mean_token_accuracy": 0.7788125872612, "num_tokens": 25882565.0, "step": 4709, "train/ce_loss": 0.6517266035079956 }, { "epoch": 0.46559224836859797, "step": 4709, "train/sim_loss": 0.0234375 }, { "epoch": 0.46559224836859797, "step": 4709, "train/total_loss": 0.08861016482114792 }, { "entropy": 8.694735527038574, "epoch": 0.4656911212181135, "mean_token_accuracy": 0.7818382978439331, "num_tokens": 25888129.0, "step": 4710, "train/ce_loss": 0.752456545829773 }, { "epoch": 0.4656911212181135, "step": 4710, "train/sim_loss": 0.03515625 }, { "epoch": 0.4656911212181135, "step": 4710, "train/total_loss": 0.11040190607309341 }, { "entropy": 8.887953758239746, "epoch": 0.46578999406762905, "mean_token_accuracy": 0.7570900321006775, "num_tokens": 25893537.0, "step": 4711, "train/ce_loss": 0.7138849496841431 }, { "epoch": 0.46578999406762905, "step": 4711, "train/sim_loss": 0.0859375 }, { "epoch": 0.46578999406762905, "step": 4711, "train/total_loss": 0.15732599794864655 }, { "entropy": 9.228413581848145, "epoch": 0.46588886691714454, "mean_token_accuracy": 0.7876614332199097, "num_tokens": 25898828.0, "step": 4712, "train/ce_loss": 0.8094720840454102 }, { "epoch": 0.46588886691714454, "step": 4712, "train/sim_loss": 0.0625 }, { "epoch": 0.46588886691714454, "step": 4712, "train/total_loss": 0.14344722032546997 }, { "entropy": 8.990254402160645, "epoch": 0.4659877397666601, "mean_token_accuracy": 0.786620557308197, "num_tokens": 25904223.0, "step": 4713, "train/ce_loss": 0.474127858877182 }, { "epoch": 0.4659877397666601, "step": 4713, "train/sim_loss": 0.04296875 }, { "epoch": 0.4659877397666601, "step": 4713, "train/total_loss": 0.09038153290748596 }, { "entropy": 9.159770965576172, "epoch": 0.4660866126161756, "mean_token_accuracy": 0.7547169923782349, "num_tokens": 25909592.0, "step": 4714, "train/ce_loss": 0.7042274475097656 }, { "epoch": 0.4660866126161756, "step": 4714, "train/sim_loss": 0.0234375 }, { "epoch": 0.4660866126161756, "step": 4714, "train/total_loss": 0.09386024624109268 }, { "entropy": 8.947244644165039, "epoch": 0.4661854854656911, "mean_token_accuracy": 0.7152466177940369, "num_tokens": 25915067.0, "step": 4715, "train/ce_loss": 0.8372911810874939 }, { "epoch": 0.4661854854656911, "step": 4715, "train/sim_loss": 0.0703125 }, { "epoch": 0.4661854854656911, "step": 4715, "train/total_loss": 0.1540416181087494 }, { "entropy": 9.238380432128906, "epoch": 0.46628435831520665, "mean_token_accuracy": 0.7385786771774292, "num_tokens": 25920415.0, "step": 4716, "train/ce_loss": 0.42712607979774475 }, { "epoch": 0.46628435831520665, "step": 4716, "train/sim_loss": 0.01953125 }, { "epoch": 0.46628435831520665, "step": 4716, "train/total_loss": 0.062243860214948654 }, { "entropy": 8.928970336914062, "epoch": 0.4663832311647222, "mean_token_accuracy": 0.7635933756828308, "num_tokens": 25925920.0, "step": 4717, "train/ce_loss": 1.1316108703613281 }, { "epoch": 0.4663832311647222, "step": 4717, "train/sim_loss": 0.09765625 }, { "epoch": 0.4663832311647222, "step": 4717, "train/total_loss": 0.2108173370361328 }, { "entropy": 8.971315383911133, "epoch": 0.4664821040142377, "mean_token_accuracy": 0.7181286811828613, "num_tokens": 25931338.0, "step": 4718, "train/ce_loss": 0.5772479772567749 }, { "epoch": 0.4664821040142377, "step": 4718, "train/sim_loss": 0.0703125 }, { "epoch": 0.4664821040142377, "step": 4718, "train/total_loss": 0.12803730368614197 }, { "entropy": 8.976540565490723, "epoch": 0.4665809768637532, "mean_token_accuracy": 0.7930142283439636, "num_tokens": 25936775.0, "step": 4719, "train/ce_loss": 0.5354031324386597 }, { "epoch": 0.4665809768637532, "step": 4719, "train/sim_loss": 0.04296875 }, { "epoch": 0.4665809768637532, "step": 4719, "train/total_loss": 0.09650906920433044 }, { "epoch": 0.46667984971326876, "grad_norm": 0.6885977983474731, "learning_rate": 8.83573159274094e-06, "loss": 0.1355, "step": 4720 }, { "entropy": 9.150774002075195, "epoch": 0.46667984971326876, "mean_token_accuracy": 0.7115628719329834, "num_tokens": 25942154.0, "step": 4720, "train/ce_loss": 1.0418184995651245 }, { "epoch": 0.46667984971326876, "step": 4720, "train/sim_loss": 0.06640625 }, { "epoch": 0.46667984971326876, "step": 4720, "train/total_loss": 0.17058810591697693 }, { "entropy": 9.086135864257812, "epoch": 0.46677872256278424, "mean_token_accuracy": 0.7183271646499634, "num_tokens": 25947631.0, "step": 4721, "train/ce_loss": 0.9785376191139221 }, { "epoch": 0.46677872256278424, "step": 4721, "train/sim_loss": 0.0625 }, { "epoch": 0.46677872256278424, "step": 4721, "train/total_loss": 0.16035376489162445 }, { "entropy": 8.828842163085938, "epoch": 0.4668775954122998, "mean_token_accuracy": 0.7427293062210083, "num_tokens": 25953181.0, "step": 4722, "train/ce_loss": 0.6624770760536194 }, { "epoch": 0.4668775954122998, "step": 4722, "train/sim_loss": 0.0859375 }, { "epoch": 0.4668775954122998, "step": 4722, "train/total_loss": 0.15218520164489746 }, { "entropy": 9.064956665039062, "epoch": 0.4669764682618153, "mean_token_accuracy": 0.7345678806304932, "num_tokens": 25958619.0, "step": 4723, "train/ce_loss": 0.49713125824928284 }, { "epoch": 0.4669764682618153, "step": 4723, "train/sim_loss": 0.078125 }, { "epoch": 0.4669764682618153, "step": 4723, "train/total_loss": 0.127838134765625 }, { "entropy": 8.789127349853516, "epoch": 0.4670753411113308, "mean_token_accuracy": 0.7995802760124207, "num_tokens": 25964171.0, "step": 4724, "train/ce_loss": 0.3580959439277649 }, { "epoch": 0.4670753411113308, "step": 4724, "train/sim_loss": 0.01953125 }, { "epoch": 0.4670753411113308, "step": 4724, "train/total_loss": 0.05534084513783455 }, { "entropy": 9.176322937011719, "epoch": 0.46717421396084635, "mean_token_accuracy": 0.7609618306159973, "num_tokens": 25969455.0, "step": 4725, "train/ce_loss": 0.6497082114219666 }, { "epoch": 0.46717421396084635, "step": 4725, "train/sim_loss": 0.03515625 }, { "epoch": 0.46717421396084635, "step": 4725, "train/total_loss": 0.10012707114219666 }, { "entropy": 9.176478385925293, "epoch": 0.4672730868103619, "mean_token_accuracy": 0.8228915929794312, "num_tokens": 25974822.0, "step": 4726, "train/ce_loss": 0.5293235182762146 }, { "epoch": 0.4672730868103619, "step": 4726, "train/sim_loss": 0.015625 }, { "epoch": 0.4672730868103619, "step": 4726, "train/total_loss": 0.06855735182762146 }, { "entropy": 8.710596084594727, "epoch": 0.4673719596598774, "mean_token_accuracy": 0.7349726557731628, "num_tokens": 25980207.0, "step": 4727, "train/ce_loss": 0.4157286286354065 }, { "epoch": 0.4673719596598774, "step": 4727, "train/sim_loss": 0.0390625 }, { "epoch": 0.4673719596598774, "step": 4727, "train/total_loss": 0.08063536882400513 }, { "entropy": 9.145271301269531, "epoch": 0.4674708325093929, "mean_token_accuracy": 0.7179487347602844, "num_tokens": 25985795.0, "step": 4728, "train/ce_loss": 1.1262478828430176 }, { "epoch": 0.4674708325093929, "step": 4728, "train/sim_loss": 0.03125 }, { "epoch": 0.4674708325093929, "step": 4728, "train/total_loss": 0.14387479424476624 }, { "entropy": 9.150556564331055, "epoch": 0.46756970535890846, "mean_token_accuracy": 0.7196367979049683, "num_tokens": 25991332.0, "step": 4729, "train/ce_loss": 0.7849563360214233 }, { "epoch": 0.46756970535890846, "step": 4729, "train/sim_loss": 0.0546875 }, { "epoch": 0.46756970535890846, "step": 4729, "train/total_loss": 0.13318313658237457 }, { "entropy": 9.120893478393555, "epoch": 0.46766857820842395, "mean_token_accuracy": 0.7186700701713562, "num_tokens": 25996745.0, "step": 4730, "train/ce_loss": 1.160485029220581 }, { "epoch": 0.46766857820842395, "step": 4730, "train/sim_loss": 0.046875 }, { "epoch": 0.46766857820842395, "step": 4730, "train/total_loss": 0.16292351484298706 }, { "entropy": 9.200052261352539, "epoch": 0.4677674510579395, "mean_token_accuracy": 0.7702311873435974, "num_tokens": 26001998.0, "step": 4731, "train/ce_loss": 0.8952283263206482 }, { "epoch": 0.4677674510579395, "step": 4731, "train/sim_loss": 0.03515625 }, { "epoch": 0.4677674510579395, "step": 4731, "train/total_loss": 0.1246790811419487 }, { "entropy": 8.9644136428833, "epoch": 0.46786632390745503, "mean_token_accuracy": 0.7558139562606812, "num_tokens": 26007368.0, "step": 4732, "train/ce_loss": 0.9910418391227722 }, { "epoch": 0.46786632390745503, "step": 4732, "train/sim_loss": 0.08203125 }, { "epoch": 0.46786632390745503, "step": 4732, "train/total_loss": 0.18113544583320618 }, { "entropy": 8.735422134399414, "epoch": 0.4679651967569705, "mean_token_accuracy": 0.6882951855659485, "num_tokens": 26012778.0, "step": 4733, "train/ce_loss": 0.9790635704994202 }, { "epoch": 0.4679651967569705, "step": 4733, "train/sim_loss": 0.0546875 }, { "epoch": 0.4679651967569705, "step": 4733, "train/total_loss": 0.15259385108947754 }, { "entropy": 8.892043113708496, "epoch": 0.46806406960648606, "mean_token_accuracy": 0.7795637249946594, "num_tokens": 26018212.0, "step": 4734, "train/ce_loss": 0.8840950131416321 }, { "epoch": 0.46806406960648606, "step": 4734, "train/sim_loss": 0.046875 }, { "epoch": 0.46806406960648606, "step": 4734, "train/total_loss": 0.13528451323509216 }, { "entropy": 9.228240966796875, "epoch": 0.4681629424560016, "mean_token_accuracy": 0.7870239615440369, "num_tokens": 26023520.0, "step": 4735, "train/ce_loss": 0.3681926429271698 }, { "epoch": 0.4681629424560016, "step": 4735, "train/sim_loss": 0.07421875 }, { "epoch": 0.4681629424560016, "step": 4735, "train/total_loss": 0.11103801429271698 }, { "entropy": 9.0722017288208, "epoch": 0.4682618153055171, "mean_token_accuracy": 0.753947377204895, "num_tokens": 26028883.0, "step": 4736, "train/ce_loss": 0.8567046523094177 }, { "epoch": 0.4682618153055171, "step": 4736, "train/sim_loss": 0.10546875 }, { "epoch": 0.4682618153055171, "step": 4736, "train/total_loss": 0.19113922119140625 }, { "entropy": 8.833041191101074, "epoch": 0.4683606881550326, "mean_token_accuracy": 0.7202380895614624, "num_tokens": 26034523.0, "step": 4737, "train/ce_loss": 1.8239306211471558 }, { "epoch": 0.4683606881550326, "step": 4737, "train/sim_loss": 0.06640625 }, { "epoch": 0.4683606881550326, "step": 4737, "train/total_loss": 0.24879930913448334 }, { "entropy": 8.782368659973145, "epoch": 0.46845956100454816, "mean_token_accuracy": 0.767241358757019, "num_tokens": 26040044.0, "step": 4738, "train/ce_loss": 0.5488168001174927 }, { "epoch": 0.46845956100454816, "step": 4738, "train/sim_loss": 0.12109375 }, { "epoch": 0.46845956100454816, "step": 4738, "train/total_loss": 0.17597542703151703 }, { "entropy": 8.992538452148438, "epoch": 0.46855843385406365, "mean_token_accuracy": 0.7436241507530212, "num_tokens": 26045378.0, "step": 4739, "train/ce_loss": 0.4719219505786896 }, { "epoch": 0.46855843385406365, "step": 4739, "train/sim_loss": 0.078125 }, { "epoch": 0.46855843385406365, "step": 4739, "train/total_loss": 0.12531720101833344 }, { "epoch": 0.4686573067035792, "grad_norm": 0.7076136469841003, "learning_rate": 8.83078672798299e-06, "loss": 0.1375, "step": 4740 }, { "entropy": 8.639327049255371, "epoch": 0.4686573067035792, "mean_token_accuracy": 0.7505330443382263, "num_tokens": 26050972.0, "step": 4740, "train/ce_loss": 0.529935359954834 }, { "epoch": 0.4686573067035792, "step": 4740, "train/sim_loss": 0.05078125 }, { "epoch": 0.4686573067035792, "step": 4740, "train/total_loss": 0.1037747859954834 }, { "entropy": 8.858098983764648, "epoch": 0.46875617955309473, "mean_token_accuracy": 0.7803680896759033, "num_tokens": 26056450.0, "step": 4741, "train/ce_loss": 0.8601021766662598 }, { "epoch": 0.46875617955309473, "step": 4741, "train/sim_loss": 0.05859375 }, { "epoch": 0.46875617955309473, "step": 4741, "train/total_loss": 0.14460396766662598 }, { "entropy": 9.018011093139648, "epoch": 0.4688550524026102, "mean_token_accuracy": 0.731272280216217, "num_tokens": 26061909.0, "step": 4742, "train/ce_loss": 1.0722084045410156 }, { "epoch": 0.4688550524026102, "step": 4742, "train/sim_loss": 0.1171875 }, { "epoch": 0.4688550524026102, "step": 4742, "train/total_loss": 0.2244083434343338 }, { "entropy": 8.480961799621582, "epoch": 0.46895392525212576, "mean_token_accuracy": 0.7255814075469971, "num_tokens": 26067533.0, "step": 4743, "train/ce_loss": 1.1865861415863037 }, { "epoch": 0.46895392525212576, "step": 4743, "train/sim_loss": 0.05078125 }, { "epoch": 0.46895392525212576, "step": 4743, "train/total_loss": 0.1694398671388626 }, { "entropy": 9.247943878173828, "epoch": 0.4690527981016413, "mean_token_accuracy": 0.7648648619651794, "num_tokens": 26072904.0, "step": 4744, "train/ce_loss": 0.8642345666885376 }, { "epoch": 0.4690527981016413, "step": 4744, "train/sim_loss": 0.05859375 }, { "epoch": 0.4690527981016413, "step": 4744, "train/total_loss": 0.14501720666885376 }, { "entropy": 9.209268569946289, "epoch": 0.4691516709511568, "mean_token_accuracy": 0.6819406747817993, "num_tokens": 26078348.0, "step": 4745, "train/ce_loss": 1.2146711349487305 }, { "epoch": 0.4691516709511568, "step": 4745, "train/sim_loss": 0.0703125 }, { "epoch": 0.4691516709511568, "step": 4745, "train/total_loss": 0.19177961349487305 }, { "entropy": 9.01669692993164, "epoch": 0.4692505438006723, "mean_token_accuracy": 0.8029196858406067, "num_tokens": 26083789.0, "step": 4746, "train/ce_loss": 0.3827207684516907 }, { "epoch": 0.4692505438006723, "step": 4746, "train/sim_loss": 0.0234375 }, { "epoch": 0.4692505438006723, "step": 4746, "train/total_loss": 0.061709579080343246 }, { "entropy": 9.229751586914062, "epoch": 0.46934941665018787, "mean_token_accuracy": 0.7976190447807312, "num_tokens": 26089112.0, "step": 4747, "train/ce_loss": 0.5127663016319275 }, { "epoch": 0.46934941665018787, "step": 4747, "train/sim_loss": 0.08984375 }, { "epoch": 0.46934941665018787, "step": 4747, "train/total_loss": 0.14112037420272827 }, { "entropy": 8.560243606567383, "epoch": 0.4694482894997034, "mean_token_accuracy": 0.7025423645973206, "num_tokens": 26094889.0, "step": 4748, "train/ce_loss": 0.8168653845787048 }, { "epoch": 0.4694482894997034, "step": 4748, "train/sim_loss": 0.05859375 }, { "epoch": 0.4694482894997034, "step": 4748, "train/total_loss": 0.14028029143810272 }, { "entropy": 9.059371948242188, "epoch": 0.4695471623492189, "mean_token_accuracy": 0.693527102470398, "num_tokens": 26100231.0, "step": 4749, "train/ce_loss": 1.0938397645950317 }, { "epoch": 0.4695471623492189, "step": 4749, "train/sim_loss": 0.06640625 }, { "epoch": 0.4695471623492189, "step": 4749, "train/total_loss": 0.1757902204990387 }, { "entropy": 9.15247917175293, "epoch": 0.46964603519873444, "mean_token_accuracy": 0.7353801131248474, "num_tokens": 26105487.0, "step": 4750, "train/ce_loss": 0.4586854577064514 }, { "epoch": 0.46964603519873444, "step": 4750, "train/sim_loss": 0.0390625 }, { "epoch": 0.46964603519873444, "step": 4750, "train/total_loss": 0.08493104577064514 }, { "entropy": 9.028118133544922, "epoch": 0.46974490804825, "mean_token_accuracy": 0.6730310320854187, "num_tokens": 26110835.0, "step": 4751, "train/ce_loss": 0.8156433701515198 }, { "epoch": 0.46974490804825, "step": 4751, "train/sim_loss": 0.078125 }, { "epoch": 0.46974490804825, "step": 4751, "train/total_loss": 0.15968933701515198 }, { "entropy": 8.589207649230957, "epoch": 0.46984378089776546, "mean_token_accuracy": 0.7264808416366577, "num_tokens": 26116598.0, "step": 4752, "train/ce_loss": 1.459617257118225 }, { "epoch": 0.46984378089776546, "step": 4752, "train/sim_loss": 0.05078125 }, { "epoch": 0.46984378089776546, "step": 4752, "train/total_loss": 0.196742981672287 }, { "entropy": 9.317126274108887, "epoch": 0.469942653747281, "mean_token_accuracy": 0.7148594260215759, "num_tokens": 26121860.0, "step": 4753, "train/ce_loss": 1.2928454875946045 }, { "epoch": 0.469942653747281, "step": 4753, "train/sim_loss": 0.03515625 }, { "epoch": 0.469942653747281, "step": 4753, "train/total_loss": 0.1644407957792282 }, { "entropy": 8.939465522766113, "epoch": 0.47004152659679654, "mean_token_accuracy": 0.7559449076652527, "num_tokens": 26127240.0, "step": 4754, "train/ce_loss": 0.5304828882217407 }, { "epoch": 0.47004152659679654, "step": 4754, "train/sim_loss": 0.015625 }, { "epoch": 0.47004152659679654, "step": 4754, "train/total_loss": 0.06867329031229019 }, { "entropy": 8.651453971862793, "epoch": 0.47014039944631203, "mean_token_accuracy": 0.7276995182037354, "num_tokens": 26132844.0, "step": 4755, "train/ce_loss": 0.5522825121879578 }, { "epoch": 0.47014039944631203, "step": 4755, "train/sim_loss": 0.04296875 }, { "epoch": 0.47014039944631203, "step": 4755, "train/total_loss": 0.09819699823856354 }, { "entropy": 9.027078628540039, "epoch": 0.47023927229582757, "mean_token_accuracy": 0.7453826069831848, "num_tokens": 26138233.0, "step": 4756, "train/ce_loss": 0.5189045667648315 }, { "epoch": 0.47023927229582757, "step": 4756, "train/sim_loss": 0.03515625 }, { "epoch": 0.47023927229582757, "step": 4756, "train/total_loss": 0.08704671263694763 }, { "entropy": 9.1876802444458, "epoch": 0.4703381451453431, "mean_token_accuracy": 0.7433510422706604, "num_tokens": 26143622.0, "step": 4757, "train/ce_loss": 0.6583666801452637 }, { "epoch": 0.4703381451453431, "step": 4757, "train/sim_loss": 0.0703125 }, { "epoch": 0.4703381451453431, "step": 4757, "train/total_loss": 0.13614916801452637 }, { "entropy": 8.554146766662598, "epoch": 0.4704370179948586, "mean_token_accuracy": 0.7290748953819275, "num_tokens": 26149279.0, "step": 4758, "train/ce_loss": 0.7541407942771912 }, { "epoch": 0.4704370179948586, "step": 4758, "train/sim_loss": 0.0234375 }, { "epoch": 0.4704370179948586, "step": 4758, "train/total_loss": 0.09885158389806747 }, { "entropy": 9.100502014160156, "epoch": 0.47053589084437414, "mean_token_accuracy": 0.762326180934906, "num_tokens": 26154619.0, "step": 4759, "train/ce_loss": 1.250136137008667 }, { "epoch": 0.47053589084437414, "step": 4759, "train/sim_loss": 0.07421875 }, { "epoch": 0.47053589084437414, "step": 4759, "train/total_loss": 0.19923236966133118 }, { "epoch": 0.4706347636938897, "grad_norm": 0.7178350687026978, "learning_rate": 8.825841863225042e-06, "loss": 0.1458, "step": 4760 }, { "entropy": 8.782573699951172, "epoch": 0.4706347636938897, "mean_token_accuracy": 0.6637512445449829, "num_tokens": 26160244.0, "step": 4760, "train/ce_loss": 0.5107755661010742 }, { "epoch": 0.4706347636938897, "step": 4760, "train/sim_loss": 0.03125 }, { "epoch": 0.4706347636938897, "step": 4760, "train/total_loss": 0.08232755959033966 }, { "entropy": 8.666130065917969, "epoch": 0.47073363654340517, "mean_token_accuracy": 0.7520759105682373, "num_tokens": 26165732.0, "step": 4761, "train/ce_loss": 1.2486469745635986 }, { "epoch": 0.47073363654340517, "step": 4761, "train/sim_loss": 0.0859375 }, { "epoch": 0.47073363654340517, "step": 4761, "train/total_loss": 0.21080219745635986 }, { "entropy": 8.99913215637207, "epoch": 0.4708325093929207, "mean_token_accuracy": 0.720818281173706, "num_tokens": 26171113.0, "step": 4762, "train/ce_loss": 0.3469093143939972 }, { "epoch": 0.4708325093929207, "step": 4762, "train/sim_loss": 0.07421875 }, { "epoch": 0.4708325093929207, "step": 4762, "train/total_loss": 0.10890968143939972 }, { "entropy": 8.997546195983887, "epoch": 0.47093138224243625, "mean_token_accuracy": 0.740645170211792, "num_tokens": 26176476.0, "step": 4763, "train/ce_loss": 0.7262362837791443 }, { "epoch": 0.47093138224243625, "step": 4763, "train/sim_loss": 0.0234375 }, { "epoch": 0.47093138224243625, "step": 4763, "train/total_loss": 0.09606113284826279 }, { "entropy": 8.677789688110352, "epoch": 0.47103025509195173, "mean_token_accuracy": 0.704901933670044, "num_tokens": 26182061.0, "step": 4764, "train/ce_loss": 0.9025349617004395 }, { "epoch": 0.47103025509195173, "step": 4764, "train/sim_loss": 0.0546875 }, { "epoch": 0.47103025509195173, "step": 4764, "train/total_loss": 0.14494100213050842 }, { "entropy": 9.152905464172363, "epoch": 0.4711291279414673, "mean_token_accuracy": 0.724465548992157, "num_tokens": 26187454.0, "step": 4765, "train/ce_loss": 1.0233715772628784 }, { "epoch": 0.4711291279414673, "step": 4765, "train/sim_loss": 0.078125 }, { "epoch": 0.4711291279414673, "step": 4765, "train/total_loss": 0.18046215176582336 }, { "entropy": 9.043693542480469, "epoch": 0.4712280007909828, "mean_token_accuracy": 0.7406014800071716, "num_tokens": 26192885.0, "step": 4766, "train/ce_loss": 0.9749665856361389 }, { "epoch": 0.4712280007909828, "step": 4766, "train/sim_loss": 0.09765625 }, { "epoch": 0.4712280007909828, "step": 4766, "train/total_loss": 0.1951529085636139 }, { "entropy": 8.964509963989258, "epoch": 0.4713268736404983, "mean_token_accuracy": 0.7241379022598267, "num_tokens": 26198363.0, "step": 4767, "train/ce_loss": 1.1905009746551514 }, { "epoch": 0.4713268736404983, "step": 4767, "train/sim_loss": 0.109375 }, { "epoch": 0.4713268736404983, "step": 4767, "train/total_loss": 0.22842510044574738 }, { "entropy": 8.995786666870117, "epoch": 0.47142574649001384, "mean_token_accuracy": 0.761904776096344, "num_tokens": 26203805.0, "step": 4768, "train/ce_loss": 0.8530048727989197 }, { "epoch": 0.47142574649001384, "step": 4768, "train/sim_loss": 0.046875 }, { "epoch": 0.47142574649001384, "step": 4768, "train/total_loss": 0.1321754902601242 }, { "entropy": 8.737541198730469, "epoch": 0.4715246193395294, "mean_token_accuracy": 0.7882978916168213, "num_tokens": 26209401.0, "step": 4769, "train/ce_loss": 0.3661430776119232 }, { "epoch": 0.4715246193395294, "step": 4769, "train/sim_loss": 0.0234375 }, { "epoch": 0.4715246193395294, "step": 4769, "train/total_loss": 0.0600518099963665 }, { "entropy": 9.017902374267578, "epoch": 0.47162349218904487, "mean_token_accuracy": 0.7210718393325806, "num_tokens": 26214812.0, "step": 4770, "train/ce_loss": 0.42708489298820496 }, { "epoch": 0.47162349218904487, "step": 4770, "train/sim_loss": 0.0859375 }, { "epoch": 0.47162349218904487, "step": 4770, "train/total_loss": 0.12864598631858826 }, { "entropy": 8.768732070922852, "epoch": 0.4717223650385604, "mean_token_accuracy": 0.7733026742935181, "num_tokens": 26220254.0, "step": 4771, "train/ce_loss": 0.77610844373703 }, { "epoch": 0.4717223650385604, "step": 4771, "train/sim_loss": 0.03125 }, { "epoch": 0.4717223650385604, "step": 4771, "train/total_loss": 0.10886084288358688 }, { "entropy": 8.943436622619629, "epoch": 0.47182123788807595, "mean_token_accuracy": 0.7168458700180054, "num_tokens": 26225734.0, "step": 4772, "train/ce_loss": 0.8204036355018616 }, { "epoch": 0.47182123788807595, "step": 4772, "train/sim_loss": 0.046875 }, { "epoch": 0.47182123788807595, "step": 4772, "train/total_loss": 0.12891536951065063 }, { "entropy": 8.63354206085205, "epoch": 0.47192011073759144, "mean_token_accuracy": 0.7211538553237915, "num_tokens": 26231449.0, "step": 4773, "train/ce_loss": 0.9448675513267517 }, { "epoch": 0.47192011073759144, "step": 4773, "train/sim_loss": 0.05859375 }, { "epoch": 0.47192011073759144, "step": 4773, "train/total_loss": 0.1530805081129074 }, { "entropy": 9.185686111450195, "epoch": 0.472018983587107, "mean_token_accuracy": 0.7503545880317688, "num_tokens": 26236686.0, "step": 4774, "train/ce_loss": 0.5171141624450684 }, { "epoch": 0.472018983587107, "step": 4774, "train/sim_loss": 0.0390625 }, { "epoch": 0.472018983587107, "step": 4774, "train/total_loss": 0.09077391773462296 }, { "entropy": 9.207576751708984, "epoch": 0.4721178564366225, "mean_token_accuracy": 0.7564895153045654, "num_tokens": 26242046.0, "step": 4775, "train/ce_loss": 0.7164689898490906 }, { "epoch": 0.4721178564366225, "step": 4775, "train/sim_loss": 0.06640625 }, { "epoch": 0.4721178564366225, "step": 4775, "train/total_loss": 0.13805314898490906 }, { "entropy": 9.09940242767334, "epoch": 0.472216729286138, "mean_token_accuracy": 0.7229729890823364, "num_tokens": 26247742.0, "step": 4776, "train/ce_loss": 0.542309582233429 }, { "epoch": 0.472216729286138, "step": 4776, "train/sim_loss": 0.07421875 }, { "epoch": 0.472216729286138, "step": 4776, "train/total_loss": 0.1284497082233429 }, { "entropy": 8.76095199584961, "epoch": 0.47231560213565355, "mean_token_accuracy": 0.8303213119506836, "num_tokens": 26253407.0, "step": 4777, "train/ce_loss": 0.4871494770050049 }, { "epoch": 0.47231560213565355, "step": 4777, "train/sim_loss": 0.078125 }, { "epoch": 0.47231560213565355, "step": 4777, "train/total_loss": 0.12683995068073273 }, { "entropy": 8.855828285217285, "epoch": 0.4724144749851691, "mean_token_accuracy": 0.7643678188323975, "num_tokens": 26258932.0, "step": 4778, "train/ce_loss": 0.46188706159591675 }, { "epoch": 0.4724144749851691, "step": 4778, "train/sim_loss": 0.0546875 }, { "epoch": 0.4724144749851691, "step": 4778, "train/total_loss": 0.10087621212005615 }, { "entropy": 8.589308738708496, "epoch": 0.47251334783468457, "mean_token_accuracy": 0.7427007555961609, "num_tokens": 26264690.0, "step": 4779, "train/ce_loss": 0.507284939289093 }, { "epoch": 0.47251334783468457, "step": 4779, "train/sim_loss": 0.05859375 }, { "epoch": 0.47251334783468457, "step": 4779, "train/total_loss": 0.10932224988937378 }, { "epoch": 0.4726122206842001, "grad_norm": 0.703830361366272, "learning_rate": 8.820896998467093e-06, "loss": 0.1429, "step": 4780 }, { "entropy": 8.130598068237305, "epoch": 0.4726122206842001, "mean_token_accuracy": 0.706962525844574, "num_tokens": 26270540.0, "step": 4780, "train/ce_loss": 0.31976455450057983 }, { "epoch": 0.4726122206842001, "step": 4780, "train/sim_loss": 0.109375 }, { "epoch": 0.4726122206842001, "step": 4780, "train/total_loss": 0.14135146141052246 }, { "entropy": 8.986353874206543, "epoch": 0.47271109353371565, "mean_token_accuracy": 0.6699029207229614, "num_tokens": 26276114.0, "step": 4781, "train/ce_loss": 1.6956143379211426 }, { "epoch": 0.47271109353371565, "step": 4781, "train/sim_loss": 0.08984375 }, { "epoch": 0.47271109353371565, "step": 4781, "train/total_loss": 0.2594051957130432 }, { "entropy": 8.7137451171875, "epoch": 0.47280996638323114, "mean_token_accuracy": 0.7822409868240356, "num_tokens": 26281732.0, "step": 4782, "train/ce_loss": 0.6179150342941284 }, { "epoch": 0.47280996638323114, "step": 4782, "train/sim_loss": 0.13671875 }, { "epoch": 0.47280996638323114, "step": 4782, "train/total_loss": 0.19851025938987732 }, { "entropy": 9.004026412963867, "epoch": 0.4729088392327467, "mean_token_accuracy": 0.6819148659706116, "num_tokens": 26287239.0, "step": 4783, "train/ce_loss": 0.4617450535297394 }, { "epoch": 0.4729088392327467, "step": 4783, "train/sim_loss": 0.0625 }, { "epoch": 0.4729088392327467, "step": 4783, "train/total_loss": 0.10867451131343842 }, { "entropy": 9.094558715820312, "epoch": 0.4730077120822622, "mean_token_accuracy": 0.782608687877655, "num_tokens": 26292627.0, "step": 4784, "train/ce_loss": 1.0191675424575806 }, { "epoch": 0.4730077120822622, "step": 4784, "train/sim_loss": 0.0390625 }, { "epoch": 0.4730077120822622, "step": 4784, "train/total_loss": 0.14097926020622253 }, { "entropy": 8.882908821105957, "epoch": 0.4731065849317777, "mean_token_accuracy": 0.7256830334663391, "num_tokens": 26298161.0, "step": 4785, "train/ce_loss": 1.230420470237732 }, { "epoch": 0.4731065849317777, "step": 4785, "train/sim_loss": 0.03125 }, { "epoch": 0.4731065849317777, "step": 4785, "train/total_loss": 0.1542920470237732 }, { "entropy": 8.898198127746582, "epoch": 0.47320545778129325, "mean_token_accuracy": 0.771327018737793, "num_tokens": 26303593.0, "step": 4786, "train/ce_loss": 1.2235711812973022 }, { "epoch": 0.47320545778129325, "step": 4786, "train/sim_loss": 0.07421875 }, { "epoch": 0.47320545778129325, "step": 4786, "train/total_loss": 0.19657588005065918 }, { "entropy": 8.864927291870117, "epoch": 0.4733043306308088, "mean_token_accuracy": 0.7077625393867493, "num_tokens": 26309096.0, "step": 4787, "train/ce_loss": 1.48554527759552 }, { "epoch": 0.4733043306308088, "step": 4787, "train/sim_loss": 0.07421875 }, { "epoch": 0.4733043306308088, "step": 4787, "train/total_loss": 0.22277328372001648 }, { "entropy": 9.00640869140625, "epoch": 0.47340320348032433, "mean_token_accuracy": 0.7367802858352661, "num_tokens": 26314542.0, "step": 4788, "train/ce_loss": 0.4521860182285309 }, { "epoch": 0.47340320348032433, "step": 4788, "train/sim_loss": 0.03125 }, { "epoch": 0.47340320348032433, "step": 4788, "train/total_loss": 0.07646860182285309 }, { "entropy": 8.49836540222168, "epoch": 0.4735020763298398, "mean_token_accuracy": 0.6921075582504272, "num_tokens": 26320244.0, "step": 4789, "train/ce_loss": 0.5520185232162476 }, { "epoch": 0.4735020763298398, "step": 4789, "train/sim_loss": 0.03125 }, { "epoch": 0.4735020763298398, "step": 4789, "train/total_loss": 0.08645185828208923 }, { "entropy": 9.000633239746094, "epoch": 0.47360094917935536, "mean_token_accuracy": 0.7348130941390991, "num_tokens": 26325795.0, "step": 4790, "train/ce_loss": 0.9298432469367981 }, { "epoch": 0.47360094917935536, "step": 4790, "train/sim_loss": 0.12890625 }, { "epoch": 0.47360094917935536, "step": 4790, "train/total_loss": 0.22189056873321533 }, { "entropy": 8.949196815490723, "epoch": 0.4736998220288709, "mean_token_accuracy": 0.799559473991394, "num_tokens": 26331317.0, "step": 4791, "train/ce_loss": 0.46504610776901245 }, { "epoch": 0.4736998220288709, "step": 4791, "train/sim_loss": 0.0234375 }, { "epoch": 0.4736998220288709, "step": 4791, "train/total_loss": 0.06994211673736572 }, { "entropy": 8.987907409667969, "epoch": 0.4737986948783864, "mean_token_accuracy": 0.7603305578231812, "num_tokens": 26336747.0, "step": 4792, "train/ce_loss": 0.6997787356376648 }, { "epoch": 0.4737986948783864, "step": 4792, "train/sim_loss": 0.06640625 }, { "epoch": 0.4737986948783864, "step": 4792, "train/total_loss": 0.13638412952423096 }, { "entropy": 9.150917053222656, "epoch": 0.4738975677279019, "mean_token_accuracy": 0.7809523940086365, "num_tokens": 26342112.0, "step": 4793, "train/ce_loss": 0.5445130467414856 }, { "epoch": 0.4738975677279019, "step": 4793, "train/sim_loss": 0.0390625 }, { "epoch": 0.4738975677279019, "step": 4793, "train/total_loss": 0.09351380169391632 }, { "entropy": 9.082426071166992, "epoch": 0.47399644057741747, "mean_token_accuracy": 0.7292191386222839, "num_tokens": 26347499.0, "step": 4794, "train/ce_loss": 1.2849928140640259 }, { "epoch": 0.47399644057741747, "step": 4794, "train/sim_loss": 0.0625 }, { "epoch": 0.47399644057741747, "step": 4794, "train/total_loss": 0.19099928438663483 }, { "entropy": 8.984776496887207, "epoch": 0.47409531342693295, "mean_token_accuracy": 0.7270471453666687, "num_tokens": 26352963.0, "step": 4795, "train/ce_loss": 1.4183458089828491 }, { "epoch": 0.47409531342693295, "step": 4795, "train/sim_loss": 0.046875 }, { "epoch": 0.47409531342693295, "step": 4795, "train/total_loss": 0.1887095868587494 }, { "entropy": 8.929255485534668, "epoch": 0.4741941862764485, "mean_token_accuracy": 0.7158671617507935, "num_tokens": 26358333.0, "step": 4796, "train/ce_loss": 0.8449040651321411 }, { "epoch": 0.4741941862764485, "step": 4796, "train/sim_loss": 0.10546875 }, { "epoch": 0.4741941862764485, "step": 4796, "train/total_loss": 0.18995916843414307 }, { "entropy": 8.799193382263184, "epoch": 0.47429305912596403, "mean_token_accuracy": 0.7387475371360779, "num_tokens": 26363892.0, "step": 4797, "train/ce_loss": 0.48724815249443054 }, { "epoch": 0.47429305912596403, "step": 4797, "train/sim_loss": 0.0625 }, { "epoch": 0.47429305912596403, "step": 4797, "train/total_loss": 0.11122481524944305 }, { "entropy": 8.444228172302246, "epoch": 0.4743919319754795, "mean_token_accuracy": 0.75, "num_tokens": 26369532.0, "step": 4798, "train/ce_loss": 0.6659486889839172 }, { "epoch": 0.4743919319754795, "step": 4798, "train/sim_loss": 0.05859375 }, { "epoch": 0.4743919319754795, "step": 4798, "train/total_loss": 0.12518861889839172 }, { "entropy": 9.125642776489258, "epoch": 0.47449080482499506, "mean_token_accuracy": 0.727419376373291, "num_tokens": 26374813.0, "step": 4799, "train/ce_loss": 0.7098814249038696 }, { "epoch": 0.47449080482499506, "step": 4799, "train/sim_loss": 0.0703125 }, { "epoch": 0.47449080482499506, "step": 4799, "train/total_loss": 0.14130064845085144 }, { "epoch": 0.4745896776745106, "grad_norm": 0.8254180550575256, "learning_rate": 8.815952133709143e-06, "loss": 0.1438, "step": 4800 }, { "entropy": 8.917043685913086, "epoch": 0.4745896776745106, "mean_token_accuracy": 0.7538644671440125, "num_tokens": 26380333.0, "step": 4800, "train/ce_loss": 0.7558116912841797 }, { "epoch": 0.4745896776745106, "step": 4800, "train/sim_loss": 0.03515625 }, { "epoch": 0.4745896776745106, "step": 4800, "train/total_loss": 0.11073742061853409 }, { "entropy": 8.750378608703613, "epoch": 0.4746885505240261, "mean_token_accuracy": 0.7450593113899231, "num_tokens": 26386097.0, "step": 4801, "train/ce_loss": 0.9047662019729614 }, { "epoch": 0.4746885505240261, "step": 4801, "train/sim_loss": 0.078125 }, { "epoch": 0.4746885505240261, "step": 4801, "train/total_loss": 0.1686016321182251 }, { "entropy": 8.952716827392578, "epoch": 0.47478742337354163, "mean_token_accuracy": 0.7758433222770691, "num_tokens": 26391583.0, "step": 4802, "train/ce_loss": 0.5883805155754089 }, { "epoch": 0.47478742337354163, "step": 4802, "train/sim_loss": 0.0859375 }, { "epoch": 0.47478742337354163, "step": 4802, "train/total_loss": 0.14477555453777313 }, { "entropy": 9.121000289916992, "epoch": 0.47488629622305717, "mean_token_accuracy": 0.7486486434936523, "num_tokens": 26396865.0, "step": 4803, "train/ce_loss": 0.648277223110199 }, { "epoch": 0.47488629622305717, "step": 4803, "train/sim_loss": 0.046875 }, { "epoch": 0.47488629622305717, "step": 4803, "train/total_loss": 0.11170272529125214 }, { "entropy": 8.687631607055664, "epoch": 0.47498516907257265, "mean_token_accuracy": 0.7295258641242981, "num_tokens": 26402474.0, "step": 4804, "train/ce_loss": 1.015256643295288 }, { "epoch": 0.47498516907257265, "step": 4804, "train/sim_loss": 0.0390625 }, { "epoch": 0.47498516907257265, "step": 4804, "train/total_loss": 0.1405881643295288 }, { "entropy": 8.917742729187012, "epoch": 0.4750840419220882, "mean_token_accuracy": 0.6937738060951233, "num_tokens": 26407921.0, "step": 4805, "train/ce_loss": 1.18875253200531 }, { "epoch": 0.4750840419220882, "step": 4805, "train/sim_loss": 0.05078125 }, { "epoch": 0.4750840419220882, "step": 4805, "train/total_loss": 0.16965651512145996 }, { "entropy": 9.130492210388184, "epoch": 0.47518291477160374, "mean_token_accuracy": 0.8002544641494751, "num_tokens": 26413275.0, "step": 4806, "train/ce_loss": 0.5472060441970825 }, { "epoch": 0.47518291477160374, "step": 4806, "train/sim_loss": 0.0546875 }, { "epoch": 0.47518291477160374, "step": 4806, "train/total_loss": 0.10940811038017273 }, { "entropy": 8.884665489196777, "epoch": 0.4752817876211192, "mean_token_accuracy": 0.7730582356452942, "num_tokens": 26418730.0, "step": 4807, "train/ce_loss": 0.31141042709350586 }, { "epoch": 0.4752817876211192, "step": 4807, "train/sim_loss": 0.0625 }, { "epoch": 0.4752817876211192, "step": 4807, "train/total_loss": 0.09364104270935059 }, { "entropy": 8.950908660888672, "epoch": 0.47538066047063476, "mean_token_accuracy": 0.7285068035125732, "num_tokens": 26424180.0, "step": 4808, "train/ce_loss": 0.5295853018760681 }, { "epoch": 0.47538066047063476, "step": 4808, "train/sim_loss": 0.09375 }, { "epoch": 0.47538066047063476, "step": 4808, "train/total_loss": 0.14670853316783905 }, { "entropy": 8.980205535888672, "epoch": 0.4754795333201503, "mean_token_accuracy": 0.7310344576835632, "num_tokens": 26429693.0, "step": 4809, "train/ce_loss": 0.7864030599594116 }, { "epoch": 0.4754795333201503, "step": 4809, "train/sim_loss": 0.02734375 }, { "epoch": 0.4754795333201503, "step": 4809, "train/total_loss": 0.10598405450582504 }, { "entropy": 8.774349212646484, "epoch": 0.4755784061696658, "mean_token_accuracy": 0.7758793830871582, "num_tokens": 26435496.0, "step": 4810, "train/ce_loss": 0.6389104127883911 }, { "epoch": 0.4755784061696658, "step": 4810, "train/sim_loss": 0.015625 }, { "epoch": 0.4755784061696658, "step": 4810, "train/total_loss": 0.07951604574918747 }, { "entropy": 8.964946746826172, "epoch": 0.47567727901918133, "mean_token_accuracy": 0.7249357104301453, "num_tokens": 26440882.0, "step": 4811, "train/ce_loss": 0.5785752534866333 }, { "epoch": 0.47567727901918133, "step": 4811, "train/sim_loss": 0.046875 }, { "epoch": 0.47567727901918133, "step": 4811, "train/total_loss": 0.10473252832889557 }, { "entropy": 9.195218086242676, "epoch": 0.4757761518686969, "mean_token_accuracy": 0.7452415823936462, "num_tokens": 26446138.0, "step": 4812, "train/ce_loss": 0.5540636777877808 }, { "epoch": 0.4757761518686969, "step": 4812, "train/sim_loss": 0.046875 }, { "epoch": 0.4757761518686969, "step": 4812, "train/total_loss": 0.1022813692688942 }, { "entropy": 8.858753204345703, "epoch": 0.47587502471821236, "mean_token_accuracy": 0.7497404217720032, "num_tokens": 26451659.0, "step": 4813, "train/ce_loss": 1.038857340812683 }, { "epoch": 0.47587502471821236, "step": 4813, "train/sim_loss": 0.0703125 }, { "epoch": 0.47587502471821236, "step": 4813, "train/total_loss": 0.1741982400417328 }, { "entropy": 9.029539108276367, "epoch": 0.4759738975677279, "mean_token_accuracy": 0.7286652326583862, "num_tokens": 26457215.0, "step": 4814, "train/ce_loss": 1.0711904764175415 }, { "epoch": 0.4759738975677279, "step": 4814, "train/sim_loss": 0.05078125 }, { "epoch": 0.4759738975677279, "step": 4814, "train/total_loss": 0.15790030360221863 }, { "entropy": 8.565958023071289, "epoch": 0.47607277041724344, "mean_token_accuracy": 0.7748690843582153, "num_tokens": 26462876.0, "step": 4815, "train/ce_loss": 0.5236809849739075 }, { "epoch": 0.47607277041724344, "step": 4815, "train/sim_loss": 0.0390625 }, { "epoch": 0.47607277041724344, "step": 4815, "train/total_loss": 0.09143060445785522 }, { "entropy": 8.862764358520508, "epoch": 0.4761716432667589, "mean_token_accuracy": 0.7627737522125244, "num_tokens": 26468334.0, "step": 4816, "train/ce_loss": 0.5970073938369751 }, { "epoch": 0.4761716432667589, "step": 4816, "train/sim_loss": 0.0703125 }, { "epoch": 0.4761716432667589, "step": 4816, "train/total_loss": 0.13001324236392975 }, { "entropy": 8.74708366394043, "epoch": 0.47627051611627447, "mean_token_accuracy": 0.7479091882705688, "num_tokens": 26473723.0, "step": 4817, "train/ce_loss": 1.1339623928070068 }, { "epoch": 0.47627051611627447, "step": 4817, "train/sim_loss": 0.05078125 }, { "epoch": 0.47627051611627447, "step": 4817, "train/total_loss": 0.16417749226093292 }, { "entropy": 8.821216583251953, "epoch": 0.47636938896579, "mean_token_accuracy": 0.7261761426925659, "num_tokens": 26479212.0, "step": 4818, "train/ce_loss": 0.922658383846283 }, { "epoch": 0.47636938896579, "step": 4818, "train/sim_loss": 0.109375 }, { "epoch": 0.47636938896579, "step": 4818, "train/total_loss": 0.20164084434509277 }, { "entropy": 9.05622673034668, "epoch": 0.4764682618153055, "mean_token_accuracy": 0.7992424368858337, "num_tokens": 26484608.0, "step": 4819, "train/ce_loss": 0.5184563398361206 }, { "epoch": 0.4764682618153055, "step": 4819, "train/sim_loss": 0.015625 }, { "epoch": 0.4764682618153055, "step": 4819, "train/total_loss": 0.06747063994407654 }, { "epoch": 0.47656713466482103, "grad_norm": 0.6893042325973511, "learning_rate": 8.811007268951196e-06, "loss": 0.1334, "step": 4820 }, { "entropy": 8.693737983703613, "epoch": 0.47656713466482103, "mean_token_accuracy": 0.7515212893486023, "num_tokens": 26490394.0, "step": 4820, "train/ce_loss": 1.1700669527053833 }, { "epoch": 0.47656713466482103, "step": 4820, "train/sim_loss": 0.109375 }, { "epoch": 0.47656713466482103, "step": 4820, "train/total_loss": 0.22638168931007385 }, { "entropy": 9.016275405883789, "epoch": 0.4766660075143366, "mean_token_accuracy": 0.7700729966163635, "num_tokens": 26495768.0, "step": 4821, "train/ce_loss": 0.5714148283004761 }, { "epoch": 0.4766660075143366, "step": 4821, "train/sim_loss": 0.046875 }, { "epoch": 0.4766660075143366, "step": 4821, "train/total_loss": 0.10401648283004761 }, { "entropy": 8.824657440185547, "epoch": 0.47676488036385206, "mean_token_accuracy": 0.7271783947944641, "num_tokens": 26501414.0, "step": 4822, "train/ce_loss": 1.0439934730529785 }, { "epoch": 0.47676488036385206, "step": 4822, "train/sim_loss": 0.0859375 }, { "epoch": 0.47676488036385206, "step": 4822, "train/total_loss": 0.19033685326576233 }, { "entropy": 8.907363891601562, "epoch": 0.4768637532133676, "mean_token_accuracy": 0.7572559118270874, "num_tokens": 26506834.0, "step": 4823, "train/ce_loss": 0.6916400790214539 }, { "epoch": 0.4768637532133676, "step": 4823, "train/sim_loss": 0.078125 }, { "epoch": 0.4768637532133676, "step": 4823, "train/total_loss": 0.14728900790214539 }, { "entropy": 8.790176391601562, "epoch": 0.47696262606288314, "mean_token_accuracy": 0.7363128662109375, "num_tokens": 26512380.0, "step": 4824, "train/ce_loss": 0.6803215146064758 }, { "epoch": 0.47696262606288314, "step": 4824, "train/sim_loss": 0.078125 }, { "epoch": 0.47696262606288314, "step": 4824, "train/total_loss": 0.1461571455001831 }, { "entropy": 9.095775604248047, "epoch": 0.47706149891239863, "mean_token_accuracy": 0.8073285818099976, "num_tokens": 26517807.0, "step": 4825, "train/ce_loss": 0.5705654621124268 }, { "epoch": 0.47706149891239863, "step": 4825, "train/sim_loss": 0.046875 }, { "epoch": 0.47706149891239863, "step": 4825, "train/total_loss": 0.10393154621124268 }, { "entropy": 8.76365852355957, "epoch": 0.47716037176191417, "mean_token_accuracy": 0.7338308691978455, "num_tokens": 26523174.0, "step": 4826, "train/ce_loss": 1.1847978830337524 }, { "epoch": 0.47716037176191417, "step": 4826, "train/sim_loss": 0.078125 }, { "epoch": 0.47716037176191417, "step": 4826, "train/total_loss": 0.19660478830337524 }, { "entropy": 8.88325309753418, "epoch": 0.4772592446114297, "mean_token_accuracy": 0.7627118825912476, "num_tokens": 26528575.0, "step": 4827, "train/ce_loss": 0.5684431195259094 }, { "epoch": 0.4772592446114297, "step": 4827, "train/sim_loss": 0.0390625 }, { "epoch": 0.4772592446114297, "step": 4827, "train/total_loss": 0.0959068089723587 }, { "entropy": 9.053342819213867, "epoch": 0.4773581174609452, "mean_token_accuracy": 0.7997481226921082, "num_tokens": 26534081.0, "step": 4828, "train/ce_loss": 0.7912935018539429 }, { "epoch": 0.4773581174609452, "step": 4828, "train/sim_loss": 0.07421875 }, { "epoch": 0.4773581174609452, "step": 4828, "train/total_loss": 0.15334810316562653 }, { "entropy": 9.125665664672852, "epoch": 0.47745699031046074, "mean_token_accuracy": 0.7068741917610168, "num_tokens": 26539457.0, "step": 4829, "train/ce_loss": 0.8126822113990784 }, { "epoch": 0.47745699031046074, "step": 4829, "train/sim_loss": 0.05859375 }, { "epoch": 0.47745699031046074, "step": 4829, "train/total_loss": 0.13986197113990784 }, { "entropy": 8.986710548400879, "epoch": 0.4775558631599763, "mean_token_accuracy": 0.7590758800506592, "num_tokens": 26544964.0, "step": 4830, "train/ce_loss": 1.0757285356521606 }, { "epoch": 0.4775558631599763, "step": 4830, "train/sim_loss": 0.046875 }, { "epoch": 0.4775558631599763, "step": 4830, "train/total_loss": 0.15444785356521606 }, { "entropy": 9.054449081420898, "epoch": 0.4776547360094918, "mean_token_accuracy": 0.7559681534767151, "num_tokens": 26550302.0, "step": 4831, "train/ce_loss": 0.9236979484558105 }, { "epoch": 0.4776547360094918, "step": 4831, "train/sim_loss": 0.046875 }, { "epoch": 0.4776547360094918, "step": 4831, "train/total_loss": 0.13924479484558105 }, { "entropy": 8.73752212524414, "epoch": 0.4777536088590073, "mean_token_accuracy": 0.7822839021682739, "num_tokens": 26555796.0, "step": 4832, "train/ce_loss": 0.7781738042831421 }, { "epoch": 0.4777536088590073, "step": 4832, "train/sim_loss": 0.0703125 }, { "epoch": 0.4777536088590073, "step": 4832, "train/total_loss": 0.1481298804283142 }, { "entropy": 8.88547134399414, "epoch": 0.47785248170852285, "mean_token_accuracy": 0.7167463898658752, "num_tokens": 26561441.0, "step": 4833, "train/ce_loss": 0.8586369752883911 }, { "epoch": 0.47785248170852285, "step": 4833, "train/sim_loss": 0.0390625 }, { "epoch": 0.47785248170852285, "step": 4833, "train/total_loss": 0.12492620199918747 }, { "entropy": 8.36124038696289, "epoch": 0.4779513545580384, "mean_token_accuracy": 0.7270606756210327, "num_tokens": 26567258.0, "step": 4834, "train/ce_loss": 0.3552090525627136 }, { "epoch": 0.4779513545580384, "step": 4834, "train/sim_loss": 0.0546875 }, { "epoch": 0.4779513545580384, "step": 4834, "train/total_loss": 0.09020841121673584 }, { "entropy": 8.973974227905273, "epoch": 0.4780502274075539, "mean_token_accuracy": 0.7568181753158569, "num_tokens": 26572805.0, "step": 4835, "train/ce_loss": 0.5505388379096985 }, { "epoch": 0.4780502274075539, "step": 4835, "train/sim_loss": 0.05859375 }, { "epoch": 0.4780502274075539, "step": 4835, "train/total_loss": 0.11364763975143433 }, { "entropy": 8.939773559570312, "epoch": 0.4781491002570694, "mean_token_accuracy": 0.6632183790206909, "num_tokens": 26578449.0, "step": 4836, "train/ce_loss": 1.1183664798736572 }, { "epoch": 0.4781491002570694, "step": 4836, "train/sim_loss": 0.0625 }, { "epoch": 0.4781491002570694, "step": 4836, "train/total_loss": 0.17433664202690125 }, { "entropy": 9.156744003295898, "epoch": 0.47824797310658496, "mean_token_accuracy": 0.7450980544090271, "num_tokens": 26583819.0, "step": 4837, "train/ce_loss": 0.5776577591896057 }, { "epoch": 0.47824797310658496, "step": 4837, "train/sim_loss": 0.0625 }, { "epoch": 0.47824797310658496, "step": 4837, "train/total_loss": 0.12026578187942505 }, { "entropy": 9.01421070098877, "epoch": 0.47834684595610044, "mean_token_accuracy": 0.8201856017112732, "num_tokens": 26589227.0, "step": 4838, "train/ce_loss": 0.45512980222702026 }, { "epoch": 0.47834684595610044, "step": 4838, "train/sim_loss": 0.0390625 }, { "epoch": 0.47834684595610044, "step": 4838, "train/total_loss": 0.08457548171281815 }, { "entropy": 9.21083927154541, "epoch": 0.478445718805616, "mean_token_accuracy": 0.7403846383094788, "num_tokens": 26594598.0, "step": 4839, "train/ce_loss": 0.6735528707504272 }, { "epoch": 0.478445718805616, "step": 4839, "train/sim_loss": 0.09375 }, { "epoch": 0.478445718805616, "step": 4839, "train/total_loss": 0.16110529005527496 }, { "epoch": 0.4785445916551315, "grad_norm": 0.690865695476532, "learning_rate": 8.806062404193246e-06, "loss": 0.1382, "step": 4840 }, { "entropy": 9.151002883911133, "epoch": 0.4785445916551315, "mean_token_accuracy": 0.7290167808532715, "num_tokens": 26599978.0, "step": 4840, "train/ce_loss": 0.5342200994491577 }, { "epoch": 0.4785445916551315, "step": 4840, "train/sim_loss": 0.08203125 }, { "epoch": 0.4785445916551315, "step": 4840, "train/total_loss": 0.1354532539844513 }, { "entropy": 9.324448585510254, "epoch": 0.478643464504647, "mean_token_accuracy": 0.6666666865348816, "num_tokens": 26605305.0, "step": 4841, "train/ce_loss": 1.8055071830749512 }, { "epoch": 0.478643464504647, "step": 4841, "train/sim_loss": 0.14453125 }, { "epoch": 0.478643464504647, "step": 4841, "train/total_loss": 0.3250819742679596 }, { "entropy": 8.87490463256836, "epoch": 0.47874233735416255, "mean_token_accuracy": 0.7281106114387512, "num_tokens": 26610989.0, "step": 4842, "train/ce_loss": 2.0154595375061035 }, { "epoch": 0.47874233735416255, "step": 4842, "train/sim_loss": 0.078125 }, { "epoch": 0.47874233735416255, "step": 4842, "train/total_loss": 0.27967095375061035 }, { "entropy": 8.602423667907715, "epoch": 0.4788412102036781, "mean_token_accuracy": 0.8227227926254272, "num_tokens": 26616634.0, "step": 4843, "train/ce_loss": 0.3566347062587738 }, { "epoch": 0.4788412102036781, "step": 4843, "train/sim_loss": 0.01953125 }, { "epoch": 0.4788412102036781, "step": 4843, "train/total_loss": 0.05519472062587738 }, { "entropy": 8.924342155456543, "epoch": 0.4789400830531936, "mean_token_accuracy": 0.7752525210380554, "num_tokens": 26622060.0, "step": 4844, "train/ce_loss": 0.600833535194397 }, { "epoch": 0.4789400830531936, "step": 4844, "train/sim_loss": 0.109375 }, { "epoch": 0.4789400830531936, "step": 4844, "train/total_loss": 0.16945835947990417 }, { "entropy": 9.005697250366211, "epoch": 0.4790389559027091, "mean_token_accuracy": 0.7394594550132751, "num_tokens": 26627644.0, "step": 4845, "train/ce_loss": 1.3627570867538452 }, { "epoch": 0.4790389559027091, "step": 4845, "train/sim_loss": 0.11328125 }, { "epoch": 0.4790389559027091, "step": 4845, "train/total_loss": 0.24955695867538452 }, { "entropy": 8.668819427490234, "epoch": 0.47913782875222466, "mean_token_accuracy": 0.7682926654815674, "num_tokens": 26633253.0, "step": 4846, "train/ce_loss": 0.9800294041633606 }, { "epoch": 0.47913782875222466, "step": 4846, "train/sim_loss": 0.07421875 }, { "epoch": 0.47913782875222466, "step": 4846, "train/total_loss": 0.17222169041633606 }, { "entropy": 8.374906539916992, "epoch": 0.47923670160174014, "mean_token_accuracy": 0.7200000286102295, "num_tokens": 26638809.0, "step": 4847, "train/ce_loss": 0.4461544454097748 }, { "epoch": 0.47923670160174014, "step": 4847, "train/sim_loss": 0.11328125 }, { "epoch": 0.47923670160174014, "step": 4847, "train/total_loss": 0.15789669752120972 }, { "entropy": 8.370838165283203, "epoch": 0.4793355744512557, "mean_token_accuracy": 0.6826666593551636, "num_tokens": 26644677.0, "step": 4848, "train/ce_loss": 1.1361217498779297 }, { "epoch": 0.4793355744512557, "step": 4848, "train/sim_loss": 0.171875 }, { "epoch": 0.4793355744512557, "step": 4848, "train/total_loss": 0.28548717498779297 }, { "entropy": 8.949868202209473, "epoch": 0.4794344473007712, "mean_token_accuracy": 0.7831021547317505, "num_tokens": 26650059.0, "step": 4849, "train/ce_loss": 0.720520555973053 }, { "epoch": 0.4794344473007712, "step": 4849, "train/sim_loss": 0.046875 }, { "epoch": 0.4794344473007712, "step": 4849, "train/total_loss": 0.11892705410718918 }, { "entropy": 9.073249816894531, "epoch": 0.4795333201502867, "mean_token_accuracy": 0.7460317611694336, "num_tokens": 26655438.0, "step": 4850, "train/ce_loss": 0.8170648813247681 }, { "epoch": 0.4795333201502867, "step": 4850, "train/sim_loss": 0.08203125 }, { "epoch": 0.4795333201502867, "step": 4850, "train/total_loss": 0.16373774409294128 }, { "entropy": 9.183783531188965, "epoch": 0.47963219299980225, "mean_token_accuracy": 0.7455782294273376, "num_tokens": 26660776.0, "step": 4851, "train/ce_loss": 1.3108309507369995 }, { "epoch": 0.47963219299980225, "step": 4851, "train/sim_loss": 0.08984375 }, { "epoch": 0.47963219299980225, "step": 4851, "train/total_loss": 0.22092685103416443 }, { "entropy": 8.996363639831543, "epoch": 0.4797310658493178, "mean_token_accuracy": 0.7776484489440918, "num_tokens": 26666259.0, "step": 4852, "train/ce_loss": 0.6336336135864258 }, { "epoch": 0.4797310658493178, "step": 4852, "train/sim_loss": 0.06640625 }, { "epoch": 0.4797310658493178, "step": 4852, "train/total_loss": 0.12976962327957153 }, { "entropy": 8.964442253112793, "epoch": 0.4798299386988333, "mean_token_accuracy": 0.7628607153892517, "num_tokens": 26671685.0, "step": 4853, "train/ce_loss": 0.5975455641746521 }, { "epoch": 0.4798299386988333, "step": 4853, "train/sim_loss": 0.05859375 }, { "epoch": 0.4798299386988333, "step": 4853, "train/total_loss": 0.11834830790758133 }, { "entropy": 8.009866714477539, "epoch": 0.4799288115483488, "mean_token_accuracy": 0.7622832655906677, "num_tokens": 26677699.0, "step": 4854, "train/ce_loss": 0.25168997049331665 }, { "epoch": 0.4799288115483488, "step": 4854, "train/sim_loss": 0.02734375 }, { "epoch": 0.4799288115483488, "step": 4854, "train/total_loss": 0.052512750029563904 }, { "entropy": 8.716022491455078, "epoch": 0.48002768439786436, "mean_token_accuracy": 0.7613412141799927, "num_tokens": 26683315.0, "step": 4855, "train/ce_loss": 0.3447045385837555 }, { "epoch": 0.48002768439786436, "step": 4855, "train/sim_loss": 0.046875 }, { "epoch": 0.48002768439786436, "step": 4855, "train/total_loss": 0.08134545385837555 }, { "entropy": 8.923629760742188, "epoch": 0.48012655724737985, "mean_token_accuracy": 0.7941176295280457, "num_tokens": 26688852.0, "step": 4856, "train/ce_loss": 0.3732772767543793 }, { "epoch": 0.48012655724737985, "step": 4856, "train/sim_loss": 0.0390625 }, { "epoch": 0.48012655724737985, "step": 4856, "train/total_loss": 0.07639022916555405 }, { "entropy": 8.890054702758789, "epoch": 0.4802254300968954, "mean_token_accuracy": 0.6918238997459412, "num_tokens": 26694258.0, "step": 4857, "train/ce_loss": 0.7974574565887451 }, { "epoch": 0.4802254300968954, "step": 4857, "train/sim_loss": 0.08203125 }, { "epoch": 0.4802254300968954, "step": 4857, "train/total_loss": 0.16177698969841003 }, { "entropy": 8.687600135803223, "epoch": 0.48032430294641093, "mean_token_accuracy": 0.8024072051048279, "num_tokens": 26699877.0, "step": 4858, "train/ce_loss": 0.4485751986503601 }, { "epoch": 0.48032430294641093, "step": 4858, "train/sim_loss": 0.0234375 }, { "epoch": 0.48032430294641093, "step": 4858, "train/total_loss": 0.06829501688480377 }, { "entropy": 9.077905654907227, "epoch": 0.4804231757959264, "mean_token_accuracy": 0.748911440372467, "num_tokens": 26705213.0, "step": 4859, "train/ce_loss": 0.8618593811988831 }, { "epoch": 0.4804231757959264, "step": 4859, "train/sim_loss": 0.0859375 }, { "epoch": 0.4804231757959264, "step": 4859, "train/total_loss": 0.17212343215942383 }, { "epoch": 0.48052204864544196, "grad_norm": 0.7602890133857727, "learning_rate": 8.801117539435298e-06, "loss": 0.1386, "step": 4860 }, { "entropy": 8.808164596557617, "epoch": 0.48052204864544196, "mean_token_accuracy": 0.779030442237854, "num_tokens": 26710716.0, "step": 4860, "train/ce_loss": 0.4421530067920685 }, { "epoch": 0.48052204864544196, "step": 4860, "train/sim_loss": 0.03515625 }, { "epoch": 0.48052204864544196, "step": 4860, "train/total_loss": 0.07937155663967133 }, { "entropy": 8.851707458496094, "epoch": 0.4806209214949575, "mean_token_accuracy": 0.7447306513786316, "num_tokens": 26716167.0, "step": 4861, "train/ce_loss": 0.8272843956947327 }, { "epoch": 0.4806209214949575, "step": 4861, "train/sim_loss": 0.0703125 }, { "epoch": 0.4806209214949575, "step": 4861, "train/total_loss": 0.15304094552993774 }, { "entropy": 9.007905960083008, "epoch": 0.480719794344473, "mean_token_accuracy": 0.7836477756500244, "num_tokens": 26721418.0, "step": 4862, "train/ce_loss": 0.6565971970558167 }, { "epoch": 0.480719794344473, "step": 4862, "train/sim_loss": 0.03125 }, { "epoch": 0.480719794344473, "step": 4862, "train/total_loss": 0.09690972417593002 }, { "entropy": 8.826101303100586, "epoch": 0.4808186671939885, "mean_token_accuracy": 0.7366071343421936, "num_tokens": 26727010.0, "step": 4863, "train/ce_loss": 1.4405766725540161 }, { "epoch": 0.4808186671939885, "step": 4863, "train/sim_loss": 0.06640625 }, { "epoch": 0.4808186671939885, "step": 4863, "train/total_loss": 0.21046392619609833 }, { "entropy": 8.905471801757812, "epoch": 0.48091754004350407, "mean_token_accuracy": 0.7752525210380554, "num_tokens": 26732420.0, "step": 4864, "train/ce_loss": 0.42263123393058777 }, { "epoch": 0.48091754004350407, "step": 4864, "train/sim_loss": 0.15625 }, { "epoch": 0.48091754004350407, "step": 4864, "train/total_loss": 0.19851312041282654 }, { "entropy": 8.79180908203125, "epoch": 0.48101641289301955, "mean_token_accuracy": 0.7269267439842224, "num_tokens": 26738129.0, "step": 4865, "train/ce_loss": 1.356407880783081 }, { "epoch": 0.48101641289301955, "step": 4865, "train/sim_loss": 0.07421875 }, { "epoch": 0.48101641289301955, "step": 4865, "train/total_loss": 0.20985953509807587 }, { "entropy": 8.856132507324219, "epoch": 0.4811152857425351, "mean_token_accuracy": 0.7546511888504028, "num_tokens": 26743547.0, "step": 4866, "train/ce_loss": 0.473410040140152 }, { "epoch": 0.4811152857425351, "step": 4866, "train/sim_loss": 0.0859375 }, { "epoch": 0.4811152857425351, "step": 4866, "train/total_loss": 0.1332785040140152 }, { "entropy": 8.62371826171875, "epoch": 0.48121415859205063, "mean_token_accuracy": 0.7688766121864319, "num_tokens": 26749271.0, "step": 4867, "train/ce_loss": 0.7767529487609863 }, { "epoch": 0.48121415859205063, "step": 4867, "train/sim_loss": 0.0625 }, { "epoch": 0.48121415859205063, "step": 4867, "train/total_loss": 0.14017529785633087 }, { "entropy": 8.82651138305664, "epoch": 0.4813130314415661, "mean_token_accuracy": 0.7302857041358948, "num_tokens": 26754752.0, "step": 4868, "train/ce_loss": 1.1042704582214355 }, { "epoch": 0.4813130314415661, "step": 4868, "train/sim_loss": 0.046875 }, { "epoch": 0.4813130314415661, "step": 4868, "train/total_loss": 0.15730205178260803 }, { "entropy": 8.733180046081543, "epoch": 0.48141190429108166, "mean_token_accuracy": 0.7417582273483276, "num_tokens": 26760287.0, "step": 4869, "train/ce_loss": 0.5761451721191406 }, { "epoch": 0.48141190429108166, "step": 4869, "train/sim_loss": 0.0546875 }, { "epoch": 0.48141190429108166, "step": 4869, "train/total_loss": 0.1123020201921463 }, { "entropy": 8.740678787231445, "epoch": 0.4815107771405972, "mean_token_accuracy": 0.7735294103622437, "num_tokens": 26765970.0, "step": 4870, "train/ce_loss": 0.723585307598114 }, { "epoch": 0.4815107771405972, "step": 4870, "train/sim_loss": 0.07421875 }, { "epoch": 0.4815107771405972, "step": 4870, "train/total_loss": 0.14657728374004364 }, { "entropy": 8.745899200439453, "epoch": 0.48160964999011274, "mean_token_accuracy": 0.7494736909866333, "num_tokens": 26771542.0, "step": 4871, "train/ce_loss": 0.660119354724884 }, { "epoch": 0.48160964999011274, "step": 4871, "train/sim_loss": 0.0390625 }, { "epoch": 0.48160964999011274, "step": 4871, "train/total_loss": 0.1050744354724884 }, { "entropy": 8.590444564819336, "epoch": 0.48170852283962823, "mean_token_accuracy": 0.6733333468437195, "num_tokens": 26777125.0, "step": 4872, "train/ce_loss": 1.0790762901306152 }, { "epoch": 0.48170852283962823, "step": 4872, "train/sim_loss": 0.11328125 }, { "epoch": 0.48170852283962823, "step": 4872, "train/total_loss": 0.22118887305259705 }, { "entropy": 8.48228931427002, "epoch": 0.48180739568914377, "mean_token_accuracy": 0.6978879570960999, "num_tokens": 26782905.0, "step": 4873, "train/ce_loss": 1.176464319229126 }, { "epoch": 0.48180739568914377, "step": 4873, "train/sim_loss": 0.0546875 }, { "epoch": 0.48180739568914377, "step": 4873, "train/total_loss": 0.17233392596244812 }, { "entropy": 8.442096710205078, "epoch": 0.4819062685386593, "mean_token_accuracy": 0.748031497001648, "num_tokens": 26788722.0, "step": 4874, "train/ce_loss": 0.8657415509223938 }, { "epoch": 0.4819062685386593, "step": 4874, "train/sim_loss": 0.05078125 }, { "epoch": 0.4819062685386593, "step": 4874, "train/total_loss": 0.13735541701316833 }, { "entropy": 9.097074508666992, "epoch": 0.4820051413881748, "mean_token_accuracy": 0.7516254782676697, "num_tokens": 26794033.0, "step": 4875, "train/ce_loss": 0.541778028011322 }, { "epoch": 0.4820051413881748, "step": 4875, "train/sim_loss": 0.05859375 }, { "epoch": 0.4820051413881748, "step": 4875, "train/total_loss": 0.11277155578136444 }, { "entropy": 9.277182579040527, "epoch": 0.48210401423769034, "mean_token_accuracy": 0.7378336191177368, "num_tokens": 26799243.0, "step": 4876, "train/ce_loss": 0.6035544276237488 }, { "epoch": 0.48210401423769034, "step": 4876, "train/sim_loss": 0.0625 }, { "epoch": 0.48210401423769034, "step": 4876, "train/total_loss": 0.12285543978214264 }, { "entropy": 8.935158729553223, "epoch": 0.4822028870872059, "mean_token_accuracy": 0.7585033774375916, "num_tokens": 26804719.0, "step": 4877, "train/ce_loss": 0.8342577815055847 }, { "epoch": 0.4822028870872059, "step": 4877, "train/sim_loss": 0.046875 }, { "epoch": 0.4822028870872059, "step": 4877, "train/total_loss": 0.13030079007148743 }, { "entropy": 9.231586456298828, "epoch": 0.48230175993672136, "mean_token_accuracy": 0.7054491639137268, "num_tokens": 26810015.0, "step": 4878, "train/ce_loss": 0.7113472819328308 }, { "epoch": 0.48230175993672136, "step": 4878, "train/sim_loss": 0.04296875 }, { "epoch": 0.48230175993672136, "step": 4878, "train/total_loss": 0.11410348117351532 }, { "entropy": 8.899628639221191, "epoch": 0.4824006327862369, "mean_token_accuracy": 0.7588306665420532, "num_tokens": 26815445.0, "step": 4879, "train/ce_loss": 0.4834241271018982 }, { "epoch": 0.4824006327862369, "step": 4879, "train/sim_loss": 0.03125 }, { "epoch": 0.4824006327862369, "step": 4879, "train/total_loss": 0.07959241420030594 }, { "epoch": 0.48249950563575245, "grad_norm": 0.7154139280319214, "learning_rate": 8.796172674677347e-06, "loss": 0.1373, "step": 4880 }, { "entropy": 8.5283842086792, "epoch": 0.48249950563575245, "mean_token_accuracy": 0.7422266602516174, "num_tokens": 26821047.0, "step": 4880, "train/ce_loss": 0.9614080190658569 }, { "epoch": 0.48249950563575245, "step": 4880, "train/sim_loss": 0.04296875 }, { "epoch": 0.48249950563575245, "step": 4880, "train/total_loss": 0.1391095519065857 }, { "entropy": 9.080994606018066, "epoch": 0.48259837848526793, "mean_token_accuracy": 0.7088451981544495, "num_tokens": 26826423.0, "step": 4881, "train/ce_loss": 0.6087433695793152 }, { "epoch": 0.48259837848526793, "step": 4881, "train/sim_loss": 0.046875 }, { "epoch": 0.48259837848526793, "step": 4881, "train/total_loss": 0.107749342918396 }, { "entropy": 9.121914863586426, "epoch": 0.4826972513347835, "mean_token_accuracy": 0.7759783864021301, "num_tokens": 26831786.0, "step": 4882, "train/ce_loss": 0.807715892791748 }, { "epoch": 0.4826972513347835, "step": 4882, "train/sim_loss": 0.11328125 }, { "epoch": 0.4826972513347835, "step": 4882, "train/total_loss": 0.19405284523963928 }, { "entropy": 8.53615665435791, "epoch": 0.482796124184299, "mean_token_accuracy": 0.6967015266418457, "num_tokens": 26837606.0, "step": 4883, "train/ce_loss": 0.30985164642333984 }, { "epoch": 0.482796124184299, "step": 4883, "train/sim_loss": 0.0234375 }, { "epoch": 0.482796124184299, "step": 4883, "train/total_loss": 0.054422665387392044 }, { "entropy": 8.839836120605469, "epoch": 0.4828949970338145, "mean_token_accuracy": 0.7540279030799866, "num_tokens": 26843226.0, "step": 4884, "train/ce_loss": 0.663811981678009 }, { "epoch": 0.4828949970338145, "step": 4884, "train/sim_loss": 0.03515625 }, { "epoch": 0.4828949970338145, "step": 4884, "train/total_loss": 0.10153745114803314 }, { "entropy": 9.117998123168945, "epoch": 0.48299386988333004, "mean_token_accuracy": 0.7782486081123352, "num_tokens": 26848563.0, "step": 4885, "train/ce_loss": 0.6376133561134338 }, { "epoch": 0.48299386988333004, "step": 4885, "train/sim_loss": 0.03125 }, { "epoch": 0.48299386988333004, "step": 4885, "train/total_loss": 0.09501133859157562 }, { "entropy": 9.016672134399414, "epoch": 0.4830927427328456, "mean_token_accuracy": 0.7434679269790649, "num_tokens": 26854030.0, "step": 4886, "train/ce_loss": 1.7007310390472412 }, { "epoch": 0.4830927427328456, "step": 4886, "train/sim_loss": 0.109375 }, { "epoch": 0.4830927427328456, "step": 4886, "train/total_loss": 0.27944809198379517 }, { "entropy": 8.500059127807617, "epoch": 0.48319161558236107, "mean_token_accuracy": 0.7526316046714783, "num_tokens": 26859603.0, "step": 4887, "train/ce_loss": 0.9412975311279297 }, { "epoch": 0.48319161558236107, "step": 4887, "train/sim_loss": 0.0625 }, { "epoch": 0.48319161558236107, "step": 4887, "train/total_loss": 0.1566297560930252 }, { "entropy": 9.257912635803223, "epoch": 0.4832904884318766, "mean_token_accuracy": 0.7460317611694336, "num_tokens": 26864920.0, "step": 4888, "train/ce_loss": 0.5095155835151672 }, { "epoch": 0.4832904884318766, "step": 4888, "train/sim_loss": 0.0625 }, { "epoch": 0.4832904884318766, "step": 4888, "train/total_loss": 0.11345155537128448 }, { "entropy": 9.197246551513672, "epoch": 0.48338936128139215, "mean_token_accuracy": 0.7731214165687561, "num_tokens": 26870179.0, "step": 4889, "train/ce_loss": 0.39534127712249756 }, { "epoch": 0.48338936128139215, "step": 4889, "train/sim_loss": 0.0390625 }, { "epoch": 0.48338936128139215, "step": 4889, "train/total_loss": 0.07859662920236588 }, { "entropy": 8.683895111083984, "epoch": 0.48348823413090763, "mean_token_accuracy": 0.7447916865348816, "num_tokens": 26875937.0, "step": 4890, "train/ce_loss": 0.3428470492362976 }, { "epoch": 0.48348823413090763, "step": 4890, "train/sim_loss": 0.015625 }, { "epoch": 0.48348823413090763, "step": 4890, "train/total_loss": 0.04990970715880394 }, { "entropy": 8.84736442565918, "epoch": 0.4835871069804232, "mean_token_accuracy": 0.7047619223594666, "num_tokens": 26881494.0, "step": 4891, "train/ce_loss": 0.541689395904541 }, { "epoch": 0.4835871069804232, "step": 4891, "train/sim_loss": 0.05078125 }, { "epoch": 0.4835871069804232, "step": 4891, "train/total_loss": 0.1049501895904541 }, { "entropy": 8.968128204345703, "epoch": 0.4836859798299387, "mean_token_accuracy": 0.7095046639442444, "num_tokens": 26886867.0, "step": 4892, "train/ce_loss": 1.3482760190963745 }, { "epoch": 0.4836859798299387, "step": 4892, "train/sim_loss": 0.09765625 }, { "epoch": 0.4836859798299387, "step": 4892, "train/total_loss": 0.2324838489294052 }, { "entropy": 9.114463806152344, "epoch": 0.4837848526794542, "mean_token_accuracy": 0.7422062158584595, "num_tokens": 26892365.0, "step": 4893, "train/ce_loss": 1.150546908378601 }, { "epoch": 0.4837848526794542, "step": 4893, "train/sim_loss": 0.08203125 }, { "epoch": 0.4837848526794542, "step": 4893, "train/total_loss": 0.19708594679832458 }, { "entropy": 8.408669471740723, "epoch": 0.48388372552896974, "mean_token_accuracy": 0.7388646006584167, "num_tokens": 26898212.0, "step": 4894, "train/ce_loss": 1.0537893772125244 }, { "epoch": 0.48388372552896974, "step": 4894, "train/sim_loss": 0.046875 }, { "epoch": 0.48388372552896974, "step": 4894, "train/total_loss": 0.15225394070148468 }, { "entropy": 9.301807403564453, "epoch": 0.4839825983784853, "mean_token_accuracy": 0.7381864786148071, "num_tokens": 26903568.0, "step": 4895, "train/ce_loss": 0.7628718018531799 }, { "epoch": 0.4839825983784853, "step": 4895, "train/sim_loss": 0.0390625 }, { "epoch": 0.4839825983784853, "step": 4895, "train/total_loss": 0.115349680185318 }, { "entropy": 8.966933250427246, "epoch": 0.48408147122800077, "mean_token_accuracy": 0.7338362336158752, "num_tokens": 26909173.0, "step": 4896, "train/ce_loss": 0.38323739171028137 }, { "epoch": 0.48408147122800077, "step": 4896, "train/sim_loss": 0.09375 }, { "epoch": 0.48408147122800077, "step": 4896, "train/total_loss": 0.13207374513149261 }, { "entropy": 8.96944808959961, "epoch": 0.4841803440775163, "mean_token_accuracy": 0.7652733325958252, "num_tokens": 26914754.0, "step": 4897, "train/ce_loss": 0.7741009593009949 }, { "epoch": 0.4841803440775163, "step": 4897, "train/sim_loss": 0.06640625 }, { "epoch": 0.4841803440775163, "step": 4897, "train/total_loss": 0.14381635189056396 }, { "entropy": 8.633848190307617, "epoch": 0.48427921692703185, "mean_token_accuracy": 0.7679924368858337, "num_tokens": 26920436.0, "step": 4898, "train/ce_loss": 0.35862302780151367 }, { "epoch": 0.48427921692703185, "step": 4898, "train/sim_loss": 0.02734375 }, { "epoch": 0.48427921692703185, "step": 4898, "train/total_loss": 0.06320605427026749 }, { "entropy": 9.334051132202148, "epoch": 0.48437808977654734, "mean_token_accuracy": 0.7240896224975586, "num_tokens": 26925809.0, "step": 4899, "train/ce_loss": 0.6549046635627747 }, { "epoch": 0.48437808977654734, "step": 4899, "train/sim_loss": 0.04296875 }, { "epoch": 0.48437808977654734, "step": 4899, "train/total_loss": 0.1084592193365097 }, { "epoch": 0.4844769626260629, "grad_norm": 0.7226547002792358, "learning_rate": 8.7912278099194e-06, "loss": 0.1398, "step": 4900 }, { "entropy": 9.116944313049316, "epoch": 0.4844769626260629, "mean_token_accuracy": 0.7467532753944397, "num_tokens": 26931259.0, "step": 4900, "train/ce_loss": 0.46073371171951294 }, { "epoch": 0.4844769626260629, "step": 4900, "train/sim_loss": 0.01953125 }, { "epoch": 0.4844769626260629, "step": 4900, "train/total_loss": 0.06560462713241577 }, { "entropy": 8.924238204956055, "epoch": 0.4845758354755784, "mean_token_accuracy": 0.7705099582672119, "num_tokens": 26936767.0, "step": 4901, "train/ce_loss": 0.5779252052307129 }, { "epoch": 0.4845758354755784, "step": 4901, "train/sim_loss": 0.015625 }, { "epoch": 0.4845758354755784, "step": 4901, "train/total_loss": 0.07341752201318741 }, { "entropy": 8.9938383102417, "epoch": 0.4846747083250939, "mean_token_accuracy": 0.6946107745170593, "num_tokens": 26942141.0, "step": 4902, "train/ce_loss": 0.8005409240722656 }, { "epoch": 0.4846747083250939, "step": 4902, "train/sim_loss": 0.0703125 }, { "epoch": 0.4846747083250939, "step": 4902, "train/total_loss": 0.15036660432815552 }, { "entropy": 8.59581184387207, "epoch": 0.48477358117460945, "mean_token_accuracy": 0.7394297122955322, "num_tokens": 26947816.0, "step": 4903, "train/ce_loss": 0.6642596125602722 }, { "epoch": 0.48477358117460945, "step": 4903, "train/sim_loss": 0.16015625 }, { "epoch": 0.48477358117460945, "step": 4903, "train/total_loss": 0.22658221423625946 }, { "entropy": 8.923118591308594, "epoch": 0.484872454024125, "mean_token_accuracy": 0.7330960631370544, "num_tokens": 26953352.0, "step": 4904, "train/ce_loss": 0.9230614304542542 }, { "epoch": 0.484872454024125, "step": 4904, "train/sim_loss": 0.12109375 }, { "epoch": 0.484872454024125, "step": 4904, "train/total_loss": 0.21339988708496094 }, { "entropy": 8.712615966796875, "epoch": 0.4849713268736405, "mean_token_accuracy": 0.7382246255874634, "num_tokens": 26959050.0, "step": 4905, "train/ce_loss": 0.3415037989616394 }, { "epoch": 0.4849713268736405, "step": 4905, "train/sim_loss": 0.01171875 }, { "epoch": 0.4849713268736405, "step": 4905, "train/total_loss": 0.045869130641222 }, { "entropy": 9.029336929321289, "epoch": 0.485070199723156, "mean_token_accuracy": 0.7608951926231384, "num_tokens": 26964514.0, "step": 4906, "train/ce_loss": 0.7507150173187256 }, { "epoch": 0.485070199723156, "step": 4906, "train/sim_loss": 0.0625 }, { "epoch": 0.485070199723156, "step": 4906, "train/total_loss": 0.1375715136528015 }, { "entropy": 8.817264556884766, "epoch": 0.48516907257267156, "mean_token_accuracy": 0.8036677241325378, "num_tokens": 26970018.0, "step": 4907, "train/ce_loss": 0.633598268032074 }, { "epoch": 0.48516907257267156, "step": 4907, "train/sim_loss": 0.0859375 }, { "epoch": 0.48516907257267156, "step": 4907, "train/total_loss": 0.1492973268032074 }, { "entropy": 8.799577713012695, "epoch": 0.48526794542218704, "mean_token_accuracy": 0.7821655869483948, "num_tokens": 26975489.0, "step": 4908, "train/ce_loss": 0.5232946872711182 }, { "epoch": 0.48526794542218704, "step": 4908, "train/sim_loss": 0.0234375 }, { "epoch": 0.48526794542218704, "step": 4908, "train/total_loss": 0.07576696574687958 }, { "entropy": 8.636083602905273, "epoch": 0.4853668182717026, "mean_token_accuracy": 0.7368941903114319, "num_tokens": 26981182.0, "step": 4909, "train/ce_loss": 0.7105228900909424 }, { "epoch": 0.4853668182717026, "step": 4909, "train/sim_loss": 0.0390625 }, { "epoch": 0.4853668182717026, "step": 4909, "train/total_loss": 0.11011479049921036 }, { "entropy": 8.977350234985352, "epoch": 0.4854656911212181, "mean_token_accuracy": 0.7557252049446106, "num_tokens": 26986725.0, "step": 4910, "train/ce_loss": 0.5901734828948975 }, { "epoch": 0.4854656911212181, "step": 4910, "train/sim_loss": 0.046875 }, { "epoch": 0.4854656911212181, "step": 4910, "train/total_loss": 0.10589234530925751 }, { "entropy": 9.112669944763184, "epoch": 0.4855645639707336, "mean_token_accuracy": 0.7571022510528564, "num_tokens": 26991922.0, "step": 4911, "train/ce_loss": 0.724094033241272 }, { "epoch": 0.4855645639707336, "step": 4911, "train/sim_loss": 0.06640625 }, { "epoch": 0.4855645639707336, "step": 4911, "train/total_loss": 0.13881565630435944 }, { "entropy": 9.036425590515137, "epoch": 0.48566343682024915, "mean_token_accuracy": 0.7830423712730408, "num_tokens": 26997312.0, "step": 4912, "train/ce_loss": 0.5476614832878113 }, { "epoch": 0.48566343682024915, "step": 4912, "train/sim_loss": 0.01953125 }, { "epoch": 0.48566343682024915, "step": 4912, "train/total_loss": 0.07429739832878113 }, { "entropy": 8.985062599182129, "epoch": 0.4857623096697647, "mean_token_accuracy": 0.7512500286102295, "num_tokens": 27002745.0, "step": 4913, "train/ce_loss": 0.6853305697441101 }, { "epoch": 0.4857623096697647, "step": 4913, "train/sim_loss": 0.0625 }, { "epoch": 0.4857623096697647, "step": 4913, "train/total_loss": 0.1310330629348755 }, { "entropy": 8.711065292358398, "epoch": 0.48586118251928023, "mean_token_accuracy": 0.7563959956169128, "num_tokens": 27008253.0, "step": 4914, "train/ce_loss": 0.5171759724617004 }, { "epoch": 0.48586118251928023, "step": 4914, "train/sim_loss": 0.0234375 }, { "epoch": 0.48586118251928023, "step": 4914, "train/total_loss": 0.0751550942659378 }, { "entropy": 8.84826946258545, "epoch": 0.4859600553687957, "mean_token_accuracy": 0.7594936490058899, "num_tokens": 27013612.0, "step": 4915, "train/ce_loss": 0.7613685727119446 }, { "epoch": 0.4859600553687957, "step": 4915, "train/sim_loss": 0.05078125 }, { "epoch": 0.4859600553687957, "step": 4915, "train/total_loss": 0.12691810727119446 }, { "entropy": 8.7720947265625, "epoch": 0.48605892821831126, "mean_token_accuracy": 0.7145969271659851, "num_tokens": 27019096.0, "step": 4916, "train/ce_loss": 1.1455674171447754 }, { "epoch": 0.48605892821831126, "step": 4916, "train/sim_loss": 0.0625 }, { "epoch": 0.48605892821831126, "step": 4916, "train/total_loss": 0.17705674469470978 }, { "entropy": 9.316564559936523, "epoch": 0.4861578010678268, "mean_token_accuracy": 0.7350689172744751, "num_tokens": 27024303.0, "step": 4917, "train/ce_loss": 0.6632058620452881 }, { "epoch": 0.4861578010678268, "step": 4917, "train/sim_loss": 0.0546875 }, { "epoch": 0.4861578010678268, "step": 4917, "train/total_loss": 0.12100809067487717 }, { "entropy": 9.079893112182617, "epoch": 0.4862566739173423, "mean_token_accuracy": 0.7539682388305664, "num_tokens": 27029611.0, "step": 4918, "train/ce_loss": 0.584978461265564 }, { "epoch": 0.4862566739173423, "step": 4918, "train/sim_loss": 0.05078125 }, { "epoch": 0.4862566739173423, "step": 4918, "train/total_loss": 0.1092790961265564 }, { "entropy": 8.90101432800293, "epoch": 0.4863555467668578, "mean_token_accuracy": 0.7494226098060608, "num_tokens": 27035055.0, "step": 4919, "train/ce_loss": 1.0295590162277222 }, { "epoch": 0.4863555467668578, "step": 4919, "train/sim_loss": 0.05078125 }, { "epoch": 0.4863555467668578, "step": 4919, "train/total_loss": 0.1537371575832367 }, { "epoch": 0.48645441961637337, "grad_norm": 0.7031439542770386, "learning_rate": 8.78628294516145e-06, "loss": 0.1414, "step": 4920 }, { "entropy": 9.266952514648438, "epoch": 0.48645441961637337, "mean_token_accuracy": 0.7776333093643188, "num_tokens": 27040453.0, "step": 4920, "train/ce_loss": 0.5505962371826172 }, { "epoch": 0.48645441961637337, "step": 4920, "train/sim_loss": 0.0625 }, { "epoch": 0.48645441961637337, "step": 4920, "train/total_loss": 0.11755962669849396 }, { "entropy": 8.873427391052246, "epoch": 0.48655329246588885, "mean_token_accuracy": 0.7961783409118652, "num_tokens": 27046019.0, "step": 4921, "train/ce_loss": 0.8268890976905823 }, { "epoch": 0.48655329246588885, "step": 4921, "train/sim_loss": 0.046875 }, { "epoch": 0.48655329246588885, "step": 4921, "train/total_loss": 0.12956391274929047 }, { "entropy": 8.831591606140137, "epoch": 0.4866521653154044, "mean_token_accuracy": 0.7442371249198914, "num_tokens": 27051586.0, "step": 4922, "train/ce_loss": 0.827384352684021 }, { "epoch": 0.4866521653154044, "step": 4922, "train/sim_loss": 0.06640625 }, { "epoch": 0.4866521653154044, "step": 4922, "train/total_loss": 0.14914467930793762 }, { "entropy": 8.936354637145996, "epoch": 0.48675103816491994, "mean_token_accuracy": 0.7327394485473633, "num_tokens": 27057062.0, "step": 4923, "train/ce_loss": 0.838235080242157 }, { "epoch": 0.48675103816491994, "step": 4923, "train/sim_loss": 0.03125 }, { "epoch": 0.48675103816491994, "step": 4923, "train/total_loss": 0.11507350951433182 }, { "entropy": 8.730464935302734, "epoch": 0.4868499110144354, "mean_token_accuracy": 0.7338629364967346, "num_tokens": 27062700.0, "step": 4924, "train/ce_loss": 0.41562461853027344 }, { "epoch": 0.4868499110144354, "step": 4924, "train/sim_loss": 0.078125 }, { "epoch": 0.4868499110144354, "step": 4924, "train/total_loss": 0.11968746781349182 }, { "entropy": 9.049154281616211, "epoch": 0.48694878386395096, "mean_token_accuracy": 0.7073760628700256, "num_tokens": 27068097.0, "step": 4925, "train/ce_loss": 0.776642918586731 }, { "epoch": 0.48694878386395096, "step": 4925, "train/sim_loss": 0.0625 }, { "epoch": 0.48694878386395096, "step": 4925, "train/total_loss": 0.14016428589820862 }, { "entropy": 8.544002532958984, "epoch": 0.4870476567134665, "mean_token_accuracy": 0.7357075810432434, "num_tokens": 27073733.0, "step": 4926, "train/ce_loss": 1.6965383291244507 }, { "epoch": 0.4870476567134665, "step": 4926, "train/sim_loss": 0.046875 }, { "epoch": 0.4870476567134665, "step": 4926, "train/total_loss": 0.21652883291244507 }, { "entropy": 9.181346893310547, "epoch": 0.487146529562982, "mean_token_accuracy": 0.7997416257858276, "num_tokens": 27079116.0, "step": 4927, "train/ce_loss": 0.6368092894554138 }, { "epoch": 0.487146529562982, "step": 4927, "train/sim_loss": 0.0390625 }, { "epoch": 0.487146529562982, "step": 4927, "train/total_loss": 0.10274343192577362 }, { "entropy": 8.919554710388184, "epoch": 0.48724540241249753, "mean_token_accuracy": 0.7540574073791504, "num_tokens": 27084576.0, "step": 4928, "train/ce_loss": 1.0307215452194214 }, { "epoch": 0.48724540241249753, "step": 4928, "train/sim_loss": 0.10546875 }, { "epoch": 0.48724540241249753, "step": 4928, "train/total_loss": 0.2085409164428711 }, { "entropy": 8.862112998962402, "epoch": 0.48734427526201307, "mean_token_accuracy": 0.7822839021682739, "num_tokens": 27090080.0, "step": 4929, "train/ce_loss": 0.6907874941825867 }, { "epoch": 0.48734427526201307, "step": 4929, "train/sim_loss": 0.0546875 }, { "epoch": 0.48734427526201307, "step": 4929, "train/total_loss": 0.12376625090837479 }, { "entropy": 8.997406005859375, "epoch": 0.48744314811152856, "mean_token_accuracy": 0.7610965967178345, "num_tokens": 27095550.0, "step": 4930, "train/ce_loss": 0.9153851866722107 }, { "epoch": 0.48744314811152856, "step": 4930, "train/sim_loss": 0.03125 }, { "epoch": 0.48744314811152856, "step": 4930, "train/total_loss": 0.12278851866722107 }, { "entropy": 8.795178413391113, "epoch": 0.4875420209610441, "mean_token_accuracy": 0.6935483813285828, "num_tokens": 27100987.0, "step": 4931, "train/ce_loss": 0.7868033647537231 }, { "epoch": 0.4875420209610441, "step": 4931, "train/sim_loss": 0.05859375 }, { "epoch": 0.4875420209610441, "step": 4931, "train/total_loss": 0.13727408647537231 }, { "entropy": 9.175443649291992, "epoch": 0.48764089381055964, "mean_token_accuracy": 0.7326478362083435, "num_tokens": 27106393.0, "step": 4932, "train/ce_loss": 0.816450834274292 }, { "epoch": 0.48764089381055964, "step": 4932, "train/sim_loss": 0.0859375 }, { "epoch": 0.48764089381055964, "step": 4932, "train/total_loss": 0.16758258640766144 }, { "entropy": 8.84401798248291, "epoch": 0.4877397666600751, "mean_token_accuracy": 0.7792349457740784, "num_tokens": 27111932.0, "step": 4933, "train/ce_loss": 0.5747295618057251 }, { "epoch": 0.4877397666600751, "step": 4933, "train/sim_loss": 0.0390625 }, { "epoch": 0.4877397666600751, "step": 4933, "train/total_loss": 0.09653545916080475 }, { "entropy": 8.446155548095703, "epoch": 0.48783863950959067, "mean_token_accuracy": 0.6553945541381836, "num_tokens": 27117757.0, "step": 4934, "train/ce_loss": 1.9062564373016357 }, { "epoch": 0.48783863950959067, "step": 4934, "train/sim_loss": 0.078125 }, { "epoch": 0.48783863950959067, "step": 4934, "train/total_loss": 0.2687506675720215 }, { "entropy": 9.093807220458984, "epoch": 0.4879375123591062, "mean_token_accuracy": 0.7506361603736877, "num_tokens": 27123186.0, "step": 4935, "train/ce_loss": 1.0247273445129395 }, { "epoch": 0.4879375123591062, "step": 4935, "train/sim_loss": 0.125 }, { "epoch": 0.4879375123591062, "step": 4935, "train/total_loss": 0.22747273743152618 }, { "entropy": 8.687833786010742, "epoch": 0.4880363852086217, "mean_token_accuracy": 0.7093750238418579, "num_tokens": 27128792.0, "step": 4936, "train/ce_loss": 0.975292444229126 }, { "epoch": 0.4880363852086217, "step": 4936, "train/sim_loss": 0.05859375 }, { "epoch": 0.4880363852086217, "step": 4936, "train/total_loss": 0.15612299740314484 }, { "entropy": 8.879434585571289, "epoch": 0.48813525805813723, "mean_token_accuracy": 0.7490683197975159, "num_tokens": 27134107.0, "step": 4937, "train/ce_loss": 0.916359543800354 }, { "epoch": 0.48813525805813723, "step": 4937, "train/sim_loss": 0.05859375 }, { "epoch": 0.48813525805813723, "step": 4937, "train/total_loss": 0.15022970736026764 }, { "entropy": 9.121956825256348, "epoch": 0.4882341309076528, "mean_token_accuracy": 0.7319034934043884, "num_tokens": 27139474.0, "step": 4938, "train/ce_loss": 0.789492666721344 }, { "epoch": 0.4882341309076528, "step": 4938, "train/sim_loss": 0.0546875 }, { "epoch": 0.4882341309076528, "step": 4938, "train/total_loss": 0.13363677263259888 }, { "entropy": 8.96034049987793, "epoch": 0.48833300375716826, "mean_token_accuracy": 0.77320796251297, "num_tokens": 27145019.0, "step": 4939, "train/ce_loss": 0.505705714225769 }, { "epoch": 0.48833300375716826, "step": 4939, "train/sim_loss": 0.046875 }, { "epoch": 0.48833300375716826, "step": 4939, "train/total_loss": 0.09744557738304138 }, { "epoch": 0.4884318766066838, "grad_norm": 0.8071562051773071, "learning_rate": 8.781338080403502e-06, "loss": 0.1435, "step": 4940 }, { "entropy": 8.535490036010742, "epoch": 0.4884318766066838, "mean_token_accuracy": 0.718324601650238, "num_tokens": 27150622.0, "step": 4940, "train/ce_loss": 1.519147276878357 }, { "epoch": 0.4884318766066838, "step": 4940, "train/sim_loss": 0.06640625 }, { "epoch": 0.4884318766066838, "step": 4940, "train/total_loss": 0.21832098066806793 }, { "entropy": 9.079718589782715, "epoch": 0.48853074945619934, "mean_token_accuracy": 0.7342932224273682, "num_tokens": 27156018.0, "step": 4941, "train/ce_loss": 0.8063054084777832 }, { "epoch": 0.48853074945619934, "step": 4941, "train/sim_loss": 0.0703125 }, { "epoch": 0.48853074945619934, "step": 4941, "train/total_loss": 0.15094304084777832 }, { "entropy": 8.708730697631836, "epoch": 0.48862962230571483, "mean_token_accuracy": 0.7379733920097351, "num_tokens": 27161536.0, "step": 4942, "train/ce_loss": 0.7309055328369141 }, { "epoch": 0.48862962230571483, "step": 4942, "train/sim_loss": 0.0390625 }, { "epoch": 0.48862962230571483, "step": 4942, "train/total_loss": 0.1121530532836914 }, { "entropy": 8.726943969726562, "epoch": 0.48872849515523037, "mean_token_accuracy": 0.8026461005210876, "num_tokens": 27167081.0, "step": 4943, "train/ce_loss": 0.8508040904998779 }, { "epoch": 0.48872849515523037, "step": 4943, "train/sim_loss": 0.078125 }, { "epoch": 0.48872849515523037, "step": 4943, "train/total_loss": 0.16320541501045227 }, { "entropy": 8.89500617980957, "epoch": 0.4888273680047459, "mean_token_accuracy": 0.7411225438117981, "num_tokens": 27172460.0, "step": 4944, "train/ce_loss": 0.7238611578941345 }, { "epoch": 0.4888273680047459, "step": 4944, "train/sim_loss": 0.05078125 }, { "epoch": 0.4888273680047459, "step": 4944, "train/total_loss": 0.12316736578941345 }, { "entropy": 8.987115859985352, "epoch": 0.4889262408542614, "mean_token_accuracy": 0.7422934770584106, "num_tokens": 27177893.0, "step": 4945, "train/ce_loss": 0.7193001508712769 }, { "epoch": 0.4889262408542614, "step": 4945, "train/sim_loss": 0.015625 }, { "epoch": 0.4889262408542614, "step": 4945, "train/total_loss": 0.08755501359701157 }, { "entropy": 8.854781150817871, "epoch": 0.48902511370377694, "mean_token_accuracy": 0.7766990065574646, "num_tokens": 27183515.0, "step": 4946, "train/ce_loss": 0.524625837802887 }, { "epoch": 0.48902511370377694, "step": 4946, "train/sim_loss": 0.03515625 }, { "epoch": 0.48902511370377694, "step": 4946, "train/total_loss": 0.08761883527040482 }, { "entropy": 8.672306060791016, "epoch": 0.4891239865532925, "mean_token_accuracy": 0.7154724597930908, "num_tokens": 27188988.0, "step": 4947, "train/ce_loss": 0.9296212792396545 }, { "epoch": 0.4891239865532925, "step": 4947, "train/sim_loss": 0.0625 }, { "epoch": 0.4891239865532925, "step": 4947, "train/total_loss": 0.1554621309041977 }, { "entropy": 9.004825592041016, "epoch": 0.48922285940280796, "mean_token_accuracy": 0.7622950673103333, "num_tokens": 27194353.0, "step": 4948, "train/ce_loss": 0.4180926978588104 }, { "epoch": 0.48922285940280796, "step": 4948, "train/sim_loss": 0.0390625 }, { "epoch": 0.48922285940280796, "step": 4948, "train/total_loss": 0.08087177574634552 }, { "entropy": 9.038949012756348, "epoch": 0.4893217322523235, "mean_token_accuracy": 0.774032473564148, "num_tokens": 27199787.0, "step": 4949, "train/ce_loss": 1.008306622505188 }, { "epoch": 0.4893217322523235, "step": 4949, "train/sim_loss": 0.0703125 }, { "epoch": 0.4893217322523235, "step": 4949, "train/total_loss": 0.17114317417144775 }, { "entropy": 9.2123441696167, "epoch": 0.48942060510183905, "mean_token_accuracy": 0.7483617067337036, "num_tokens": 27205131.0, "step": 4950, "train/ce_loss": 0.680835485458374 }, { "epoch": 0.48942060510183905, "step": 4950, "train/sim_loss": 0.06640625 }, { "epoch": 0.48942060510183905, "step": 4950, "train/total_loss": 0.13448980450630188 }, { "entropy": 9.112770080566406, "epoch": 0.48951947795135453, "mean_token_accuracy": 0.7698309421539307, "num_tokens": 27210454.0, "step": 4951, "train/ce_loss": 0.5641814470291138 }, { "epoch": 0.48951947795135453, "step": 4951, "train/sim_loss": 0.0234375 }, { "epoch": 0.48951947795135453, "step": 4951, "train/total_loss": 0.07985565066337585 }, { "entropy": 9.219940185546875, "epoch": 0.4896183508008701, "mean_token_accuracy": 0.7270440459251404, "num_tokens": 27215879.0, "step": 4952, "train/ce_loss": 1.067116141319275 }, { "epoch": 0.4896183508008701, "step": 4952, "train/sim_loss": 0.0390625 }, { "epoch": 0.4896183508008701, "step": 4952, "train/total_loss": 0.14577412605285645 }, { "entropy": 8.947468757629395, "epoch": 0.4897172236503856, "mean_token_accuracy": 0.7601918578147888, "num_tokens": 27221234.0, "step": 4953, "train/ce_loss": 0.4873523414134979 }, { "epoch": 0.4897172236503856, "step": 4953, "train/sim_loss": 0.0234375 }, { "epoch": 0.4897172236503856, "step": 4953, "train/total_loss": 0.07217273116111755 }, { "entropy": 8.812744140625, "epoch": 0.48981609649990115, "mean_token_accuracy": 0.765625, "num_tokens": 27226762.0, "step": 4954, "train/ce_loss": 1.0018863677978516 }, { "epoch": 0.48981609649990115, "step": 4954, "train/sim_loss": 0.0546875 }, { "epoch": 0.48981609649990115, "step": 4954, "train/total_loss": 0.15487614274024963 }, { "entropy": 8.726654052734375, "epoch": 0.48991496934941664, "mean_token_accuracy": 0.7273603081703186, "num_tokens": 27232472.0, "step": 4955, "train/ce_loss": 0.9766483306884766 }, { "epoch": 0.48991496934941664, "step": 4955, "train/sim_loss": 0.05078125 }, { "epoch": 0.48991496934941664, "step": 4955, "train/total_loss": 0.14844608306884766 }, { "entropy": 8.971062660217285, "epoch": 0.4900138421989322, "mean_token_accuracy": 0.7445783019065857, "num_tokens": 27237859.0, "step": 4956, "train/ce_loss": 0.94769287109375 }, { "epoch": 0.4900138421989322, "step": 4956, "train/sim_loss": 0.046875 }, { "epoch": 0.4900138421989322, "step": 4956, "train/total_loss": 0.14164429903030396 }, { "entropy": 9.347953796386719, "epoch": 0.4901127150484477, "mean_token_accuracy": 0.7037037014961243, "num_tokens": 27243069.0, "step": 4957, "train/ce_loss": 0.8488903045654297 }, { "epoch": 0.4901127150484477, "step": 4957, "train/sim_loss": 0.046875 }, { "epoch": 0.4901127150484477, "step": 4957, "train/total_loss": 0.1317640244960785 }, { "entropy": 8.950490951538086, "epoch": 0.4902115878979632, "mean_token_accuracy": 0.7308120131492615, "num_tokens": 27248647.0, "step": 4958, "train/ce_loss": 0.7352393865585327 }, { "epoch": 0.4902115878979632, "step": 4958, "train/sim_loss": 0.05859375 }, { "epoch": 0.4902115878979632, "step": 4958, "train/total_loss": 0.13211768865585327 }, { "entropy": 8.814451217651367, "epoch": 0.49031046074747875, "mean_token_accuracy": 0.7530726194381714, "num_tokens": 27254220.0, "step": 4959, "train/ce_loss": 0.4250522255897522 }, { "epoch": 0.49031046074747875, "step": 4959, "train/sim_loss": 0.046875 }, { "epoch": 0.49031046074747875, "step": 4959, "train/total_loss": 0.08938021957874298 }, { "epoch": 0.4904093335969943, "grad_norm": 0.6640818119049072, "learning_rate": 8.776393215645553e-06, "loss": 0.1389, "step": 4960 }, { "entropy": 9.400506019592285, "epoch": 0.4904093335969943, "mean_token_accuracy": 0.7233009934425354, "num_tokens": 27259306.0, "step": 4960, "train/ce_loss": 0.4392494857311249 }, { "epoch": 0.4904093335969943, "step": 4960, "train/sim_loss": 0.05859375 }, { "epoch": 0.4904093335969943, "step": 4960, "train/total_loss": 0.10251870006322861 }, { "entropy": 8.988218307495117, "epoch": 0.4905082064465098, "mean_token_accuracy": 0.7743467688560486, "num_tokens": 27264696.0, "step": 4961, "train/ce_loss": 0.7026248574256897 }, { "epoch": 0.4905082064465098, "step": 4961, "train/sim_loss": 0.03125 }, { "epoch": 0.4905082064465098, "step": 4961, "train/total_loss": 0.10151248425245285 }, { "entropy": 9.249125480651855, "epoch": 0.4906070792960253, "mean_token_accuracy": 0.7624161243438721, "num_tokens": 27270056.0, "step": 4962, "train/ce_loss": 0.5940555930137634 }, { "epoch": 0.4906070792960253, "step": 4962, "train/sim_loss": 0.04296875 }, { "epoch": 0.4906070792960253, "step": 4962, "train/total_loss": 0.10237431526184082 }, { "entropy": 8.736263275146484, "epoch": 0.49070595214554086, "mean_token_accuracy": 0.7502774596214294, "num_tokens": 27275641.0, "step": 4963, "train/ce_loss": 0.6085458397865295 }, { "epoch": 0.49070595214554086, "step": 4963, "train/sim_loss": 0.0234375 }, { "epoch": 0.49070595214554086, "step": 4963, "train/total_loss": 0.08429208397865295 }, { "entropy": 8.95068645477295, "epoch": 0.49080482499505634, "mean_token_accuracy": 0.730140209197998, "num_tokens": 27281001.0, "step": 4964, "train/ce_loss": 0.9438299536705017 }, { "epoch": 0.49080482499505634, "step": 4964, "train/sim_loss": 0.0703125 }, { "epoch": 0.49080482499505634, "step": 4964, "train/total_loss": 0.16469550132751465 }, { "entropy": 8.141060829162598, "epoch": 0.4909036978445719, "mean_token_accuracy": 0.7215657234191895, "num_tokens": 27286961.0, "step": 4965, "train/ce_loss": 0.48852840065956116 }, { "epoch": 0.4909036978445719, "step": 4965, "train/sim_loss": 0.078125 }, { "epoch": 0.4909036978445719, "step": 4965, "train/total_loss": 0.1269778460264206 }, { "entropy": 9.118463516235352, "epoch": 0.4910025706940874, "mean_token_accuracy": 0.7704517841339111, "num_tokens": 27292399.0, "step": 4966, "train/ce_loss": 0.852812647819519 }, { "epoch": 0.4910025706940874, "step": 4966, "train/sim_loss": 0.0625 }, { "epoch": 0.4910025706940874, "step": 4966, "train/total_loss": 0.14778126776218414 }, { "entropy": 9.054220199584961, "epoch": 0.4911014435436029, "mean_token_accuracy": 0.7063007950782776, "num_tokens": 27298039.0, "step": 4967, "train/ce_loss": 1.754141926765442 }, { "epoch": 0.4911014435436029, "step": 4967, "train/sim_loss": 0.171875 }, { "epoch": 0.4911014435436029, "step": 4967, "train/total_loss": 0.34728920459747314 }, { "entropy": 8.418065071105957, "epoch": 0.49120031639311845, "mean_token_accuracy": 0.7805362343788147, "num_tokens": 27303632.0, "step": 4968, "train/ce_loss": 0.591689944267273 }, { "epoch": 0.49120031639311845, "step": 4968, "train/sim_loss": 0.0390625 }, { "epoch": 0.49120031639311845, "step": 4968, "train/total_loss": 0.0982314944267273 }, { "entropy": 9.431183815002441, "epoch": 0.491299189242634, "mean_token_accuracy": 0.800000011920929, "num_tokens": 27308885.0, "step": 4969, "train/ce_loss": 0.5204214453697205 }, { "epoch": 0.491299189242634, "step": 4969, "train/sim_loss": 0.0546875 }, { "epoch": 0.491299189242634, "step": 4969, "train/total_loss": 0.10672964155673981 }, { "entropy": 9.244543075561523, "epoch": 0.4913980620921495, "mean_token_accuracy": 0.7566433548927307, "num_tokens": 27314196.0, "step": 4970, "train/ce_loss": 1.181792974472046 }, { "epoch": 0.4913980620921495, "step": 4970, "train/sim_loss": 0.05078125 }, { "epoch": 0.4913980620921495, "step": 4970, "train/total_loss": 0.1689605414867401 }, { "entropy": 8.921154975891113, "epoch": 0.491496934941665, "mean_token_accuracy": 0.7148803472518921, "num_tokens": 27319747.0, "step": 4971, "train/ce_loss": 0.4875986874103546 }, { "epoch": 0.491496934941665, "step": 4971, "train/sim_loss": 0.01953125 }, { "epoch": 0.491496934941665, "step": 4971, "train/total_loss": 0.06829112023115158 }, { "entropy": 8.593174934387207, "epoch": 0.49159580779118056, "mean_token_accuracy": 0.7288930416107178, "num_tokens": 27325370.0, "step": 4972, "train/ce_loss": 0.5642191767692566 }, { "epoch": 0.49159580779118056, "step": 4972, "train/sim_loss": 0.03515625 }, { "epoch": 0.49159580779118056, "step": 4972, "train/total_loss": 0.0915781706571579 }, { "entropy": 8.907352447509766, "epoch": 0.49169468064069605, "mean_token_accuracy": 0.7196162343025208, "num_tokens": 27330882.0, "step": 4973, "train/ce_loss": 1.384791612625122 }, { "epoch": 0.49169468064069605, "step": 4973, "train/sim_loss": 0.0546875 }, { "epoch": 0.49169468064069605, "step": 4973, "train/total_loss": 0.19316665828227997 }, { "entropy": 9.145120620727539, "epoch": 0.4917935534902116, "mean_token_accuracy": 0.7581453919410706, "num_tokens": 27336242.0, "step": 4974, "train/ce_loss": 0.6992141604423523 }, { "epoch": 0.4917935534902116, "step": 4974, "train/sim_loss": 0.05078125 }, { "epoch": 0.4917935534902116, "step": 4974, "train/total_loss": 0.12070266902446747 }, { "entropy": 8.806787490844727, "epoch": 0.49189242633972713, "mean_token_accuracy": 0.7124541997909546, "num_tokens": 27341860.0, "step": 4975, "train/ce_loss": 1.8370815515518188 }, { "epoch": 0.49189242633972713, "step": 4975, "train/sim_loss": 0.0546875 }, { "epoch": 0.49189242633972713, "step": 4975, "train/total_loss": 0.23839566111564636 }, { "entropy": 8.581661224365234, "epoch": 0.4919912991892426, "mean_token_accuracy": 0.7249334454536438, "num_tokens": 27347596.0, "step": 4976, "train/ce_loss": 0.7783591747283936 }, { "epoch": 0.4919912991892426, "step": 4976, "train/sim_loss": 0.0234375 }, { "epoch": 0.4919912991892426, "step": 4976, "train/total_loss": 0.10127341747283936 }, { "entropy": 9.047252655029297, "epoch": 0.49209017203875816, "mean_token_accuracy": 0.7430107593536377, "num_tokens": 27353147.0, "step": 4977, "train/ce_loss": 1.1469461917877197 }, { "epoch": 0.49209017203875816, "step": 4977, "train/sim_loss": 0.1015625 }, { "epoch": 0.49209017203875816, "step": 4977, "train/total_loss": 0.21625712513923645 }, { "entropy": 9.45518970489502, "epoch": 0.4921890448882737, "mean_token_accuracy": 0.7511811256408691, "num_tokens": 27358353.0, "step": 4978, "train/ce_loss": 1.433322548866272 }, { "epoch": 0.4921890448882737, "step": 4978, "train/sim_loss": 0.078125 }, { "epoch": 0.4921890448882737, "step": 4978, "train/total_loss": 0.22145725786685944 }, { "entropy": 8.765846252441406, "epoch": 0.4922879177377892, "mean_token_accuracy": 0.7598814368247986, "num_tokens": 27363955.0, "step": 4979, "train/ce_loss": 0.8696557283401489 }, { "epoch": 0.4922879177377892, "step": 4979, "train/sim_loss": 0.0703125 }, { "epoch": 0.4922879177377892, "step": 4979, "train/total_loss": 0.15727807581424713 }, { "epoch": 0.4923867905873047, "grad_norm": 0.6652105450630188, "learning_rate": 8.771448350887603e-06, "loss": 0.1404, "step": 4980 }, { "entropy": 9.29217529296875, "epoch": 0.4923867905873047, "mean_token_accuracy": 0.7841269969940186, "num_tokens": 27369216.0, "step": 4980, "train/ce_loss": 0.6019874215126038 }, { "epoch": 0.4923867905873047, "step": 4980, "train/sim_loss": 0.05859375 }, { "epoch": 0.4923867905873047, "step": 4980, "train/total_loss": 0.11879248917102814 }, { "entropy": 8.796005249023438, "epoch": 0.49248566343682026, "mean_token_accuracy": 0.790398120880127, "num_tokens": 27374708.0, "step": 4981, "train/ce_loss": 0.7858574986457825 }, { "epoch": 0.49248566343682026, "step": 4981, "train/sim_loss": 0.05078125 }, { "epoch": 0.49248566343682026, "step": 4981, "train/total_loss": 0.12936699390411377 }, { "entropy": 9.180804252624512, "epoch": 0.49258453628633575, "mean_token_accuracy": 0.7379310131072998, "num_tokens": 27379916.0, "step": 4982, "train/ce_loss": 0.6463871002197266 }, { "epoch": 0.49258453628633575, "step": 4982, "train/sim_loss": 0.03515625 }, { "epoch": 0.49258453628633575, "step": 4982, "train/total_loss": 0.09979496151208878 }, { "entropy": 8.911893844604492, "epoch": 0.4926834091358513, "mean_token_accuracy": 0.7551462650299072, "num_tokens": 27385387.0, "step": 4983, "train/ce_loss": 0.2608502507209778 }, { "epoch": 0.4926834091358513, "step": 4983, "train/sim_loss": 0.02734375 }, { "epoch": 0.4926834091358513, "step": 4983, "train/total_loss": 0.0534287765622139 }, { "entropy": 9.157957077026367, "epoch": 0.49278228198536683, "mean_token_accuracy": 0.7543160915374756, "num_tokens": 27390766.0, "step": 4984, "train/ce_loss": 0.5576865077018738 }, { "epoch": 0.49278228198536683, "step": 4984, "train/sim_loss": 0.0625 }, { "epoch": 0.49278228198536683, "step": 4984, "train/total_loss": 0.11826865375041962 }, { "entropy": 8.928897857666016, "epoch": 0.4928811548348823, "mean_token_accuracy": 0.7628865838050842, "num_tokens": 27396288.0, "step": 4985, "train/ce_loss": 0.5929907560348511 }, { "epoch": 0.4928811548348823, "step": 4985, "train/sim_loss": 0.046875 }, { "epoch": 0.4928811548348823, "step": 4985, "train/total_loss": 0.10617408156394958 }, { "entropy": 9.132997512817383, "epoch": 0.49298002768439786, "mean_token_accuracy": 0.7081660032272339, "num_tokens": 27401644.0, "step": 4986, "train/ce_loss": 1.164886474609375 }, { "epoch": 0.49298002768439786, "step": 4986, "train/sim_loss": 0.09375 }, { "epoch": 0.49298002768439786, "step": 4986, "train/total_loss": 0.21023865044116974 }, { "entropy": 8.833941459655762, "epoch": 0.4930789005339134, "mean_token_accuracy": 0.7400000095367432, "num_tokens": 27407157.0, "step": 4987, "train/ce_loss": 0.7693637013435364 }, { "epoch": 0.4930789005339134, "step": 4987, "train/sim_loss": 0.05078125 }, { "epoch": 0.4930789005339134, "step": 4987, "train/total_loss": 0.12771761417388916 }, { "entropy": 9.43825912475586, "epoch": 0.4931777733834289, "mean_token_accuracy": 0.7081481218338013, "num_tokens": 27412504.0, "step": 4988, "train/ce_loss": 0.7742285132408142 }, { "epoch": 0.4931777733834289, "step": 4988, "train/sim_loss": 0.0859375 }, { "epoch": 0.4931777733834289, "step": 4988, "train/total_loss": 0.1633603572845459 }, { "entropy": 9.227444648742676, "epoch": 0.4932766462329444, "mean_token_accuracy": 0.7359307408332825, "num_tokens": 27417735.0, "step": 4989, "train/ce_loss": 1.1806464195251465 }, { "epoch": 0.4932766462329444, "step": 4989, "train/sim_loss": 0.0625 }, { "epoch": 0.4932766462329444, "step": 4989, "train/total_loss": 0.18056464195251465 }, { "entropy": 8.81460952758789, "epoch": 0.49337551908245997, "mean_token_accuracy": 0.7733026742935181, "num_tokens": 27423213.0, "step": 4990, "train/ce_loss": 0.7380661964416504 }, { "epoch": 0.49337551908245997, "step": 4990, "train/sim_loss": 0.05859375 }, { "epoch": 0.49337551908245997, "step": 4990, "train/total_loss": 0.13240036368370056 }, { "entropy": 8.755910873413086, "epoch": 0.49347439193197545, "mean_token_accuracy": 0.7234469056129456, "num_tokens": 27428782.0, "step": 4991, "train/ce_loss": 0.4885389506816864 }, { "epoch": 0.49347439193197545, "step": 4991, "train/sim_loss": 0.015625 }, { "epoch": 0.49347439193197545, "step": 4991, "train/total_loss": 0.06447889655828476 }, { "entropy": 9.073223114013672, "epoch": 0.493573264781491, "mean_token_accuracy": 0.7198985815048218, "num_tokens": 27434357.0, "step": 4992, "train/ce_loss": 1.2086448669433594 }, { "epoch": 0.493573264781491, "step": 4992, "train/sim_loss": 0.15625 }, { "epoch": 0.493573264781491, "step": 4992, "train/total_loss": 0.27711448073387146 }, { "entropy": 8.862264633178711, "epoch": 0.49367213763100654, "mean_token_accuracy": 0.7673649191856384, "num_tokens": 27439898.0, "step": 4993, "train/ce_loss": 0.6101829409599304 }, { "epoch": 0.49367213763100654, "step": 4993, "train/sim_loss": 0.015625 }, { "epoch": 0.49367213763100654, "step": 4993, "train/total_loss": 0.07664329558610916 }, { "entropy": 8.51704216003418, "epoch": 0.493771010480522, "mean_token_accuracy": 0.7978723645210266, "num_tokens": 27445474.0, "step": 4994, "train/ce_loss": 0.707502007484436 }, { "epoch": 0.493771010480522, "step": 4994, "train/sim_loss": 0.04296875 }, { "epoch": 0.493771010480522, "step": 4994, "train/total_loss": 0.11371894925832748 }, { "entropy": 8.884923934936523, "epoch": 0.49386988333003756, "mean_token_accuracy": 0.7864476442337036, "num_tokens": 27451229.0, "step": 4995, "train/ce_loss": 0.5635467171669006 }, { "epoch": 0.49386988333003756, "step": 4995, "train/sim_loss": 0.0625 }, { "epoch": 0.49386988333003756, "step": 4995, "train/total_loss": 0.11885467171669006 }, { "entropy": 8.984720230102539, "epoch": 0.4939687561795531, "mean_token_accuracy": 0.751207709312439, "num_tokens": 27456707.0, "step": 4996, "train/ce_loss": 0.4359494149684906 }, { "epoch": 0.4939687561795531, "step": 4996, "train/sim_loss": 0.0234375 }, { "epoch": 0.4939687561795531, "step": 4996, "train/total_loss": 0.06703244149684906 }, { "entropy": 9.088752746582031, "epoch": 0.49406762902906864, "mean_token_accuracy": 0.737171471118927, "num_tokens": 27462126.0, "step": 4997, "train/ce_loss": 0.6063069701194763 }, { "epoch": 0.49406762902906864, "step": 4997, "train/sim_loss": 0.046875 }, { "epoch": 0.49406762902906864, "step": 4997, "train/total_loss": 0.1075056940317154 }, { "entropy": 8.815889358520508, "epoch": 0.49416650187858413, "mean_token_accuracy": 0.748083233833313, "num_tokens": 27467721.0, "step": 4998, "train/ce_loss": 0.4824691414833069 }, { "epoch": 0.49416650187858413, "step": 4998, "train/sim_loss": 0.078125 }, { "epoch": 0.49416650187858413, "step": 4998, "train/total_loss": 0.12637192010879517 }, { "entropy": 8.115849494934082, "epoch": 0.49426537472809967, "mean_token_accuracy": 0.7011308670043945, "num_tokens": 27473462.0, "step": 4999, "train/ce_loss": 0.8829972743988037 }, { "epoch": 0.49426537472809967, "step": 4999, "train/sim_loss": 0.05859375 }, { "epoch": 0.49426537472809967, "step": 4999, "train/total_loss": 0.1468934714794159 }, { "epoch": 0.4943642475776152, "grad_norm": 0.7916337847709656, "learning_rate": 8.766503486129655e-06, "loss": 0.1382, "step": 5000 } ], "logging_steps": 20, "max_steps": 40456, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2768154148856136e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }