| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 7.0, | |
| "eval_steps": 500, | |
| "global_step": 2191, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 2.098961293697357, | |
| "epoch": 0.032, | |
| "grad_norm": 1.891703486442566, | |
| "learning_rate": 1.9148936170212767e-06, | |
| "loss": 2.0828, | |
| "mean_token_accuracy": 0.530680388212204, | |
| "num_tokens": 72723.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 2.119775766134262, | |
| "epoch": 0.064, | |
| "grad_norm": 1.2044862508773804, | |
| "learning_rate": 4.042553191489362e-06, | |
| "loss": 2.0093, | |
| "mean_token_accuracy": 0.5355814293026924, | |
| "num_tokens": 146392.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 2.220579963922501, | |
| "epoch": 0.096, | |
| "grad_norm": 0.9982365369796753, | |
| "learning_rate": 6.170212765957447e-06, | |
| "loss": 1.8939, | |
| "mean_token_accuracy": 0.5451944440603256, | |
| "num_tokens": 223711.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 2.382017892599106, | |
| "epoch": 0.128, | |
| "grad_norm": 0.7386544346809387, | |
| "learning_rate": 8.297872340425532e-06, | |
| "loss": 1.9066, | |
| "mean_token_accuracy": 0.5411656655371189, | |
| "num_tokens": 300889.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 2.274736815690994, | |
| "epoch": 0.16, | |
| "grad_norm": 0.6412256956100464, | |
| "learning_rate": 1.0425531914893619e-05, | |
| "loss": 1.7387, | |
| "mean_token_accuracy": 0.5679451540112496, | |
| "num_tokens": 377362.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 2.3663365960121157, | |
| "epoch": 0.192, | |
| "grad_norm": 0.6228290796279907, | |
| "learning_rate": 1.2553191489361702e-05, | |
| "loss": 1.7492, | |
| "mean_token_accuracy": 0.5746532663702965, | |
| "num_tokens": 449594.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 2.315044218301773, | |
| "epoch": 0.224, | |
| "grad_norm": 0.6034156680107117, | |
| "learning_rate": 1.4680851063829789e-05, | |
| "loss": 1.7111, | |
| "mean_token_accuracy": 0.5675176709890366, | |
| "num_tokens": 523439.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 2.288265961408615, | |
| "epoch": 0.256, | |
| "grad_norm": 0.45914268493652344, | |
| "learning_rate": 1.6808510638297873e-05, | |
| "loss": 1.6931, | |
| "mean_token_accuracy": 0.5713589735329151, | |
| "num_tokens": 599650.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 2.2693382859230042, | |
| "epoch": 0.288, | |
| "grad_norm": 0.6197793483734131, | |
| "learning_rate": 1.893617021276596e-05, | |
| "loss": 1.6542, | |
| "mean_token_accuracy": 0.578165066242218, | |
| "num_tokens": 675377.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 2.293796479701996, | |
| "epoch": 0.32, | |
| "grad_norm": 0.5502006411552429, | |
| "learning_rate": 1.9999866154043656e-05, | |
| "loss": 1.7108, | |
| "mean_token_accuracy": 0.5681634023785591, | |
| "num_tokens": 751838.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 2.2658903509378434, | |
| "epoch": 0.352, | |
| "grad_norm": 0.5713317394256592, | |
| "learning_rate": 1.9998795407890486e-05, | |
| "loss": 1.6168, | |
| "mean_token_accuracy": 0.5843982398509979, | |
| "num_tokens": 825539.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 2.270280033349991, | |
| "epoch": 0.384, | |
| "grad_norm": 0.5967482924461365, | |
| "learning_rate": 1.999665403023542e-05, | |
| "loss": 1.6194, | |
| "mean_token_accuracy": 0.5839526921510696, | |
| "num_tokens": 897258.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 2.2349284648895265, | |
| "epoch": 0.416, | |
| "grad_norm": 0.4899630844593048, | |
| "learning_rate": 1.9993442250368708e-05, | |
| "loss": 1.6313, | |
| "mean_token_accuracy": 0.5815729826688767, | |
| "num_tokens": 973142.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 2.245553806424141, | |
| "epoch": 0.448, | |
| "grad_norm": 0.6546034812927246, | |
| "learning_rate": 1.9989160412195047e-05, | |
| "loss": 1.6395, | |
| "mean_token_accuracy": 0.5780692532658577, | |
| "num_tokens": 1046762.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 2.288555932044983, | |
| "epoch": 0.48, | |
| "grad_norm": 0.5528404116630554, | |
| "learning_rate": 1.9983808974196752e-05, | |
| "loss": 1.7118, | |
| "mean_token_accuracy": 0.5686657652258873, | |
| "num_tokens": 1125167.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 2.232080355286598, | |
| "epoch": 0.512, | |
| "grad_norm": 0.5887461304664612, | |
| "learning_rate": 1.9977388509384656e-05, | |
| "loss": 1.6339, | |
| "mean_token_accuracy": 0.5838325396180153, | |
| "num_tokens": 1199589.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 2.2232475757598875, | |
| "epoch": 0.544, | |
| "grad_norm": 0.5764511823654175, | |
| "learning_rate": 1.9969899705236763e-05, | |
| "loss": 1.6173, | |
| "mean_token_accuracy": 0.5848860442638397, | |
| "num_tokens": 1276431.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 2.244092071056366, | |
| "epoch": 0.576, | |
| "grad_norm": 0.6295827627182007, | |
| "learning_rate": 1.9961343363624626e-05, | |
| "loss": 1.6017, | |
| "mean_token_accuracy": 0.5818701103329659, | |
| "num_tokens": 1350012.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 2.237305074930191, | |
| "epoch": 0.608, | |
| "grad_norm": 0.5939638018608093, | |
| "learning_rate": 1.9951720400727495e-05, | |
| "loss": 1.6704, | |
| "mean_token_accuracy": 0.5779796853661537, | |
| "num_tokens": 1423391.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 2.211505854129791, | |
| "epoch": 0.64, | |
| "grad_norm": 0.6119778156280518, | |
| "learning_rate": 1.9941031846934213e-05, | |
| "loss": 1.6223, | |
| "mean_token_accuracy": 0.5848233133554459, | |
| "num_tokens": 1499124.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 2.2195493161678312, | |
| "epoch": 0.672, | |
| "grad_norm": 0.6129831671714783, | |
| "learning_rate": 1.9929278846732883e-05, | |
| "loss": 1.5886, | |
| "mean_token_accuracy": 0.5897421136498451, | |
| "num_tokens": 1573541.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 2.2096123576164244, | |
| "epoch": 0.704, | |
| "grad_norm": 0.6091306209564209, | |
| "learning_rate": 1.9916462658588328e-05, | |
| "loss": 1.6031, | |
| "mean_token_accuracy": 0.5894487425684929, | |
| "num_tokens": 1649546.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 2.249841979146004, | |
| "epoch": 0.736, | |
| "grad_norm": 0.570816695690155, | |
| "learning_rate": 1.9902584654807325e-05, | |
| "loss": 1.5876, | |
| "mean_token_accuracy": 0.5911228567361831, | |
| "num_tokens": 1722199.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 2.1915894985198974, | |
| "epoch": 0.768, | |
| "grad_norm": 0.5748864412307739, | |
| "learning_rate": 1.988764632139168e-05, | |
| "loss": 1.5963, | |
| "mean_token_accuracy": 0.5891387596726417, | |
| "num_tokens": 1797304.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 2.2358563423156737, | |
| "epoch": 0.8, | |
| "grad_norm": 0.6511492729187012, | |
| "learning_rate": 1.9871649257879115e-05, | |
| "loss": 1.6453, | |
| "mean_token_accuracy": 0.5792816638946533, | |
| "num_tokens": 1870113.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 2.2169984579086304, | |
| "epoch": 0.832, | |
| "grad_norm": 0.5317641496658325, | |
| "learning_rate": 1.9854595177171968e-05, | |
| "loss": 1.6594, | |
| "mean_token_accuracy": 0.577045065164566, | |
| "num_tokens": 1947405.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 2.2434292674064635, | |
| "epoch": 0.864, | |
| "grad_norm": 0.5399971604347229, | |
| "learning_rate": 1.9836485905353823e-05, | |
| "loss": 1.7158, | |
| "mean_token_accuracy": 0.5683416239917278, | |
| "num_tokens": 2026284.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 2.227828550338745, | |
| "epoch": 0.896, | |
| "grad_norm": 0.5378643870353699, | |
| "learning_rate": 1.9817323381493933e-05, | |
| "loss": 1.6714, | |
| "mean_token_accuracy": 0.5818367518484593, | |
| "num_tokens": 2103986.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 2.2110894501209257, | |
| "epoch": 0.928, | |
| "grad_norm": 0.5195969343185425, | |
| "learning_rate": 1.979710965743964e-05, | |
| "loss": 1.6239, | |
| "mean_token_accuracy": 0.5819958478212357, | |
| "num_tokens": 2177010.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 2.1666628658771514, | |
| "epoch": 0.96, | |
| "grad_norm": 0.5663164258003235, | |
| "learning_rate": 1.977584689759664e-05, | |
| "loss": 1.6024, | |
| "mean_token_accuracy": 0.5876665830612182, | |
| "num_tokens": 2251285.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 2.214203083515167, | |
| "epoch": 0.992, | |
| "grad_norm": 0.6764860153198242, | |
| "learning_rate": 1.9753537378697237e-05, | |
| "loss": 1.6446, | |
| "mean_token_accuracy": 0.5818003416061401, | |
| "num_tokens": 2325752.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 2.16783396821273, | |
| "epoch": 1.0224, | |
| "grad_norm": 0.5795008540153503, | |
| "learning_rate": 1.9730183489556563e-05, | |
| "loss": 1.594, | |
| "mean_token_accuracy": 0.5867547392845154, | |
| "num_tokens": 2396254.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 2.172953352332115, | |
| "epoch": 1.0544, | |
| "grad_norm": 0.6686444282531738, | |
| "learning_rate": 1.9705787730816776e-05, | |
| "loss": 1.613, | |
| "mean_token_accuracy": 0.5906373374164104, | |
| "num_tokens": 2470123.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 2.2217346757650374, | |
| "epoch": 1.0864, | |
| "grad_norm": 0.6389091610908508, | |
| "learning_rate": 1.9680352714679324e-05, | |
| "loss": 1.7053, | |
| "mean_token_accuracy": 0.577599074691534, | |
| "num_tokens": 2545749.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 2.138428696990013, | |
| "epoch": 1.1184, | |
| "grad_norm": 0.7369883060455322, | |
| "learning_rate": 1.9653881164625234e-05, | |
| "loss": 1.5599, | |
| "mean_token_accuracy": 0.5946489304304123, | |
| "num_tokens": 2623270.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 2.147254040837288, | |
| "epoch": 1.1504, | |
| "grad_norm": 0.6707085967063904, | |
| "learning_rate": 1.9626375915123473e-05, | |
| "loss": 1.5843, | |
| "mean_token_accuracy": 0.5965728983283043, | |
| "num_tokens": 2697616.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 2.1412769109010696, | |
| "epoch": 1.1824, | |
| "grad_norm": 0.7201400995254517, | |
| "learning_rate": 1.9597839911327475e-05, | |
| "loss": 1.58, | |
| "mean_token_accuracy": 0.5957784004509449, | |
| "num_tokens": 2771426.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 2.164059528708458, | |
| "epoch": 1.2144, | |
| "grad_norm": 0.7561144232749939, | |
| "learning_rate": 1.9568276208759772e-05, | |
| "loss": 1.5872, | |
| "mean_token_accuracy": 0.5874110117554665, | |
| "num_tokens": 2846711.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 2.205427420139313, | |
| "epoch": 1.2464, | |
| "grad_norm": 0.691585898399353, | |
| "learning_rate": 1.9537687972984804e-05, | |
| "loss": 1.625, | |
| "mean_token_accuracy": 0.5892911069095135, | |
| "num_tokens": 2920916.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 2.1242104679346085, | |
| "epoch": 1.2784, | |
| "grad_norm": 0.6999676823616028, | |
| "learning_rate": 1.950607847926999e-05, | |
| "loss": 1.5606, | |
| "mean_token_accuracy": 0.5917269751429558, | |
| "num_tokens": 2996056.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 2.114223065972328, | |
| "epoch": 1.3104, | |
| "grad_norm": 0.7616406679153442, | |
| "learning_rate": 1.947345111223502e-05, | |
| "loss": 1.5296, | |
| "mean_token_accuracy": 0.5938275754451752, | |
| "num_tokens": 3072912.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 2.1418962299823763, | |
| "epoch": 1.3424, | |
| "grad_norm": 0.7253025770187378, | |
| "learning_rate": 1.943980936548942e-05, | |
| "loss": 1.575, | |
| "mean_token_accuracy": 0.5945621818304062, | |
| "num_tokens": 3148498.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 2.109667718410492, | |
| "epoch": 1.3744, | |
| "grad_norm": 0.8988682627677917, | |
| "learning_rate": 1.9405156841258498e-05, | |
| "loss": 1.5796, | |
| "mean_token_accuracy": 0.5901263400912284, | |
| "num_tokens": 3224741.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 2.179358023405075, | |
| "epoch": 1.4064, | |
| "grad_norm": 0.741558849811554, | |
| "learning_rate": 1.936949724999762e-05, | |
| "loss": 1.6507, | |
| "mean_token_accuracy": 0.581992793083191, | |
| "num_tokens": 3299366.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 2.1574251472949983, | |
| "epoch": 1.4384000000000001, | |
| "grad_norm": 0.7538727521896362, | |
| "learning_rate": 1.9332834409994906e-05, | |
| "loss": 1.5771, | |
| "mean_token_accuracy": 0.5888051658868789, | |
| "num_tokens": 3374162.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 2.1186763852834702, | |
| "epoch": 1.4704, | |
| "grad_norm": 0.7905173301696777, | |
| "learning_rate": 1.929517224696239e-05, | |
| "loss": 1.6138, | |
| "mean_token_accuracy": 0.584889967739582, | |
| "num_tokens": 3452582.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 2.1135365635156633, | |
| "epoch": 1.5024, | |
| "grad_norm": 0.7416484951972961, | |
| "learning_rate": 1.9256514793615674e-05, | |
| "loss": 1.5623, | |
| "mean_token_accuracy": 0.5928735345602035, | |
| "num_tokens": 3527694.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 2.146635016798973, | |
| "epoch": 1.5344, | |
| "grad_norm": 0.731999397277832, | |
| "learning_rate": 1.9216866189242095e-05, | |
| "loss": 1.5634, | |
| "mean_token_accuracy": 0.5988615363836288, | |
| "num_tokens": 3600277.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 2.1472962319850923, | |
| "epoch": 1.5664, | |
| "grad_norm": 0.7493702173233032, | |
| "learning_rate": 1.9176230679257547e-05, | |
| "loss": 1.5891, | |
| "mean_token_accuracy": 0.5858126983046532, | |
| "num_tokens": 3674781.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 2.1530486762523653, | |
| "epoch": 1.5984, | |
| "grad_norm": 0.8006687164306641, | |
| "learning_rate": 1.9134612614751865e-05, | |
| "loss": 1.5674, | |
| "mean_token_accuracy": 0.5904534175992012, | |
| "num_tokens": 3748434.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 2.169738906621933, | |
| "epoch": 1.6303999999999998, | |
| "grad_norm": 0.9293455481529236, | |
| "learning_rate": 1.909201645202294e-05, | |
| "loss": 1.6104, | |
| "mean_token_accuracy": 0.5860036969184875, | |
| "num_tokens": 3823982.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 2.178475347161293, | |
| "epoch": 1.6623999999999999, | |
| "grad_norm": 0.7716575860977173, | |
| "learning_rate": 1.904844675209956e-05, | |
| "loss": 1.6432, | |
| "mean_token_accuracy": 0.5838924221694469, | |
| "num_tokens": 3900064.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 2.1585603266954423, | |
| "epoch": 1.6944, | |
| "grad_norm": 0.8225084543228149, | |
| "learning_rate": 1.9003908180253027e-05, | |
| "loss": 1.5957, | |
| "mean_token_accuracy": 0.5880116850137711, | |
| "num_tokens": 3974029.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 2.111869788169861, | |
| "epoch": 1.7264, | |
| "grad_norm": 0.7035638093948364, | |
| "learning_rate": 1.8958405505497613e-05, | |
| "loss": 1.579, | |
| "mean_token_accuracy": 0.5890362292528153, | |
| "num_tokens": 4049974.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 2.144411253929138, | |
| "epoch": 1.7584, | |
| "grad_norm": 0.7046850919723511, | |
| "learning_rate": 1.8911943600079934e-05, | |
| "loss": 1.5926, | |
| "mean_token_accuracy": 0.5874261602759361, | |
| "num_tokens": 4125206.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 2.1093025386333464, | |
| "epoch": 1.7904, | |
| "grad_norm": 0.807727575302124, | |
| "learning_rate": 1.8864527438957223e-05, | |
| "loss": 1.5367, | |
| "mean_token_accuracy": 0.5988967984914779, | |
| "num_tokens": 4199365.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 2.097169244289398, | |
| "epoch": 1.8224, | |
| "grad_norm": 0.7856780886650085, | |
| "learning_rate": 1.881616209926465e-05, | |
| "loss": 1.561, | |
| "mean_token_accuracy": 0.5948230788111687, | |
| "num_tokens": 4275889.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 2.088553088903427, | |
| "epoch": 1.8544, | |
| "grad_norm": 0.8993458151817322, | |
| "learning_rate": 1.876685275977167e-05, | |
| "loss": 1.5557, | |
| "mean_token_accuracy": 0.5941933646798134, | |
| "num_tokens": 4350502.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 2.132419008016586, | |
| "epoch": 1.8864, | |
| "grad_norm": 0.7769711017608643, | |
| "learning_rate": 1.8716604700327516e-05, | |
| "loss": 1.6105, | |
| "mean_token_accuracy": 0.5815305605530738, | |
| "num_tokens": 4426429.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 2.1076891005039213, | |
| "epoch": 1.9184, | |
| "grad_norm": 0.9261249899864197, | |
| "learning_rate": 1.866542330129583e-05, | |
| "loss": 1.5307, | |
| "mean_token_accuracy": 0.5964644759893417, | |
| "num_tokens": 4500147.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 2.114642283320427, | |
| "epoch": 1.9504000000000001, | |
| "grad_norm": 0.806425929069519, | |
| "learning_rate": 1.8613314042978576e-05, | |
| "loss": 1.5809, | |
| "mean_token_accuracy": 0.5901800125837326, | |
| "num_tokens": 4573438.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 2.1167576968669892, | |
| "epoch": 1.9824000000000002, | |
| "grad_norm": 0.8191499710083008, | |
| "learning_rate": 1.856028250502923e-05, | |
| "loss": 1.6031, | |
| "mean_token_accuracy": 0.5843381330370903, | |
| "num_tokens": 4648156.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 2.0566249019221257, | |
| "epoch": 2.0128, | |
| "grad_norm": 0.7406135201454163, | |
| "learning_rate": 1.8506334365855315e-05, | |
| "loss": 1.5187, | |
| "mean_token_accuracy": 0.6027438483740154, | |
| "num_tokens": 4719492.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 2.0126763731241226, | |
| "epoch": 2.0448, | |
| "grad_norm": 0.8845784068107605, | |
| "learning_rate": 1.8451475402010405e-05, | |
| "loss": 1.4841, | |
| "mean_token_accuracy": 0.6069207280874253, | |
| "num_tokens": 4796271.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 2.0516900300979612, | |
| "epoch": 2.0768, | |
| "grad_norm": 0.9927017092704773, | |
| "learning_rate": 1.8395711487575564e-05, | |
| "loss": 1.512, | |
| "mean_token_accuracy": 0.6031922519207, | |
| "num_tokens": 4870202.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 2.0824343889951704, | |
| "epoch": 2.1088, | |
| "grad_norm": 0.927236795425415, | |
| "learning_rate": 1.8339048593530406e-05, | |
| "loss": 1.5843, | |
| "mean_token_accuracy": 0.5952437989413738, | |
| "num_tokens": 4945568.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 2.0304481953382494, | |
| "epoch": 2.1408, | |
| "grad_norm": 0.874019205570221, | |
| "learning_rate": 1.8281492787113707e-05, | |
| "loss": 1.5096, | |
| "mean_token_accuracy": 0.5992600306868553, | |
| "num_tokens": 5020723.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 2.0402441143989565, | |
| "epoch": 2.1728, | |
| "grad_norm": 0.8746942281723022, | |
| "learning_rate": 1.8223050231173802e-05, | |
| "loss": 1.5119, | |
| "mean_token_accuracy": 0.5994458049535751, | |
| "num_tokens": 5095780.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 2.018441066145897, | |
| "epoch": 2.2048, | |
| "grad_norm": 1.063180923461914, | |
| "learning_rate": 1.816372718350864e-05, | |
| "loss": 1.4923, | |
| "mean_token_accuracy": 0.6064845189452172, | |
| "num_tokens": 5169733.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 2.0563316702842713, | |
| "epoch": 2.2368, | |
| "grad_norm": 1.0281789302825928, | |
| "learning_rate": 1.810352999619574e-05, | |
| "loss": 1.5505, | |
| "mean_token_accuracy": 0.602813882380724, | |
| "num_tokens": 5246393.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 2.0298285841941834, | |
| "epoch": 2.2688, | |
| "grad_norm": 1.070520281791687, | |
| "learning_rate": 1.804246511491206e-05, | |
| "loss": 1.5159, | |
| "mean_token_accuracy": 0.6006126523017883, | |
| "num_tokens": 5322244.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 2.0195819228887557, | |
| "epoch": 2.3008, | |
| "grad_norm": 0.9672983884811401, | |
| "learning_rate": 1.7980539078243783e-05, | |
| "loss": 1.5166, | |
| "mean_token_accuracy": 0.6054230839014053, | |
| "num_tokens": 5399317.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 2.045917159318924, | |
| "epoch": 2.3327999999999998, | |
| "grad_norm": 1.1228744983673096, | |
| "learning_rate": 1.791775851698622e-05, | |
| "loss": 1.5096, | |
| "mean_token_accuracy": 0.6015639662742615, | |
| "num_tokens": 5473195.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 2.0935415983200074, | |
| "epoch": 2.3648, | |
| "grad_norm": 1.149794578552246, | |
| "learning_rate": 1.7854130153433785e-05, | |
| "loss": 1.5583, | |
| "mean_token_accuracy": 0.5921522840857506, | |
| "num_tokens": 5548357.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 2.044076007604599, | |
| "epoch": 2.3968, | |
| "grad_norm": 1.063625693321228, | |
| "learning_rate": 1.7789660800660222e-05, | |
| "loss": 1.5013, | |
| "mean_token_accuracy": 0.5974589124321937, | |
| "num_tokens": 5620915.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 2.092478734254837, | |
| "epoch": 2.4288, | |
| "grad_norm": 1.1822012662887573, | |
| "learning_rate": 1.7724357361789075e-05, | |
| "loss": 1.5552, | |
| "mean_token_accuracy": 0.5929681301116944, | |
| "num_tokens": 5693406.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 2.0430804908275606, | |
| "epoch": 2.4608, | |
| "grad_norm": 0.9921984076499939, | |
| "learning_rate": 1.765822682925453e-05, | |
| "loss": 1.4944, | |
| "mean_token_accuracy": 0.6029774472117424, | |
| "num_tokens": 5770143.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 2.049290281534195, | |
| "epoch": 2.4928, | |
| "grad_norm": 1.0144131183624268, | |
| "learning_rate": 1.7591276284052695e-05, | |
| "loss": 1.5437, | |
| "mean_token_accuracy": 0.5986773043870925, | |
| "num_tokens": 5844022.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 2.033898201584816, | |
| "epoch": 2.5248, | |
| "grad_norm": 1.1700315475463867, | |
| "learning_rate": 1.7523512894983396e-05, | |
| "loss": 1.5197, | |
| "mean_token_accuracy": 0.5972102269530296, | |
| "num_tokens": 5919099.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 2.03344586789608, | |
| "epoch": 2.5568, | |
| "grad_norm": 1.0503427982330322, | |
| "learning_rate": 1.745494391788257e-05, | |
| "loss": 1.5456, | |
| "mean_token_accuracy": 0.6011263683438302, | |
| "num_tokens": 5997797.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 2.0796399265527725, | |
| "epoch": 2.5888, | |
| "grad_norm": 1.0316176414489746, | |
| "learning_rate": 1.7385576694845324e-05, | |
| "loss": 1.608, | |
| "mean_token_accuracy": 0.6024919278919697, | |
| "num_tokens": 6075434.0, | |
| "step": 810 | |
| }, | |
| { | |
| "entropy": 2.0257797837257385, | |
| "epoch": 2.6208, | |
| "grad_norm": 1.048309087753296, | |
| "learning_rate": 1.7315418653439802e-05, | |
| "loss": 1.4876, | |
| "mean_token_accuracy": 0.6070949509739876, | |
| "num_tokens": 6149232.0, | |
| "step": 820 | |
| }, | |
| { | |
| "entropy": 2.024846690893173, | |
| "epoch": 2.6528, | |
| "grad_norm": 1.186710000038147, | |
| "learning_rate": 1.7244477305911845e-05, | |
| "loss": 1.499, | |
| "mean_token_accuracy": 0.6022308841347694, | |
| "num_tokens": 6222180.0, | |
| "step": 830 | |
| }, | |
| { | |
| "entropy": 1.9938248336315154, | |
| "epoch": 2.6848, | |
| "grad_norm": 1.1091604232788086, | |
| "learning_rate": 1.717276024838062e-05, | |
| "loss": 1.4795, | |
| "mean_token_accuracy": 0.6044012248516083, | |
| "num_tokens": 6296902.0, | |
| "step": 840 | |
| }, | |
| { | |
| "entropy": 1.9988998174667358, | |
| "epoch": 2.7168, | |
| "grad_norm": 1.0359690189361572, | |
| "learning_rate": 1.710027516002526e-05, | |
| "loss": 1.5173, | |
| "mean_token_accuracy": 0.6025070771574974, | |
| "num_tokens": 6373494.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 2.02343093752861, | |
| "epoch": 2.7488, | |
| "grad_norm": 1.1783568859100342, | |
| "learning_rate": 1.7027029802262598e-05, | |
| "loss": 1.5146, | |
| "mean_token_accuracy": 0.6033479735255242, | |
| "num_tokens": 6449229.0, | |
| "step": 860 | |
| }, | |
| { | |
| "entropy": 2.0429257422685625, | |
| "epoch": 2.7808, | |
| "grad_norm": 0.9909568428993225, | |
| "learning_rate": 1.6953032017916115e-05, | |
| "loss": 1.5473, | |
| "mean_token_accuracy": 0.5932901218533516, | |
| "num_tokens": 6525728.0, | |
| "step": 870 | |
| }, | |
| { | |
| "entropy": 2.0058376491069794, | |
| "epoch": 2.8128, | |
| "grad_norm": 1.0904430150985718, | |
| "learning_rate": 1.687828973037615e-05, | |
| "loss": 1.4545, | |
| "mean_token_accuracy": 0.6120153024792672, | |
| "num_tokens": 6599335.0, | |
| "step": 880 | |
| }, | |
| { | |
| "entropy": 2.005480855703354, | |
| "epoch": 2.8448, | |
| "grad_norm": 1.1638548374176025, | |
| "learning_rate": 1.6802810942751514e-05, | |
| "loss": 1.4887, | |
| "mean_token_accuracy": 0.6060751393437386, | |
| "num_tokens": 6672722.0, | |
| "step": 890 | |
| }, | |
| { | |
| "entropy": 2.0311779022216796, | |
| "epoch": 2.8768000000000002, | |
| "grad_norm": 1.1404571533203125, | |
| "learning_rate": 1.6726603737012527e-05, | |
| "loss": 1.5238, | |
| "mean_token_accuracy": 0.6015868663787842, | |
| "num_tokens": 6748069.0, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 2.0126856863498688, | |
| "epoch": 2.9088000000000003, | |
| "grad_norm": 1.0942543745040894, | |
| "learning_rate": 1.6649676273125647e-05, | |
| "loss": 1.4984, | |
| "mean_token_accuracy": 0.6019899815320968, | |
| "num_tokens": 6820935.0, | |
| "step": 910 | |
| }, | |
| { | |
| "entropy": 1.9961138010025024, | |
| "epoch": 2.9408, | |
| "grad_norm": 1.0870610475540161, | |
| "learning_rate": 1.6572036788179728e-05, | |
| "loss": 1.4962, | |
| "mean_token_accuracy": 0.6030571654438972, | |
| "num_tokens": 6896286.0, | |
| "step": 920 | |
| }, | |
| { | |
| "entropy": 2.035824549198151, | |
| "epoch": 2.9728, | |
| "grad_norm": 1.0822633504867554, | |
| "learning_rate": 1.6493693595504022e-05, | |
| "loss": 1.5354, | |
| "mean_token_accuracy": 0.5986709952354431, | |
| "num_tokens": 6971854.0, | |
| "step": 930 | |
| }, | |
| { | |
| "entropy": 2.0243908260997974, | |
| "epoch": 3.0032, | |
| "grad_norm": 1.0899602174758911, | |
| "learning_rate": 1.6414655083778027e-05, | |
| "loss": 1.5032, | |
| "mean_token_accuracy": 0.5983682243447555, | |
| "num_tokens": 7041122.0, | |
| "step": 940 | |
| }, | |
| { | |
| "entropy": 1.9538823068141937, | |
| "epoch": 3.0352, | |
| "grad_norm": 1.3042237758636475, | |
| "learning_rate": 1.633492971613326e-05, | |
| "loss": 1.4604, | |
| "mean_token_accuracy": 0.6146818101406097, | |
| "num_tokens": 7116032.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 1.9383916020393372, | |
| "epoch": 3.0672, | |
| "grad_norm": 1.397078037261963, | |
| "learning_rate": 1.6254526029247048e-05, | |
| "loss": 1.4019, | |
| "mean_token_accuracy": 0.6210932344198227, | |
| "num_tokens": 7189009.0, | |
| "step": 960 | |
| }, | |
| { | |
| "entropy": 1.9460978150367736, | |
| "epoch": 3.0992, | |
| "grad_norm": 1.2756887674331665, | |
| "learning_rate": 1.617345263242847e-05, | |
| "loss": 1.4623, | |
| "mean_token_accuracy": 0.6121616646647453, | |
| "num_tokens": 7263068.0, | |
| "step": 970 | |
| }, | |
| { | |
| "entropy": 1.9156711965799331, | |
| "epoch": 3.1312, | |
| "grad_norm": 1.1937649250030518, | |
| "learning_rate": 1.609171820669649e-05, | |
| "loss": 1.4301, | |
| "mean_token_accuracy": 0.6136599197983742, | |
| "num_tokens": 7338652.0, | |
| "step": 980 | |
| }, | |
| { | |
| "entropy": 1.9247682303190232, | |
| "epoch": 3.1632, | |
| "grad_norm": 1.3291118144989014, | |
| "learning_rate": 1.6009331503850448e-05, | |
| "loss": 1.4545, | |
| "mean_token_accuracy": 0.6153608947992325, | |
| "num_tokens": 7414529.0, | |
| "step": 990 | |
| }, | |
| { | |
| "entropy": 1.9066543668508529, | |
| "epoch": 3.1952, | |
| "grad_norm": 1.4356389045715332, | |
| "learning_rate": 1.5926301345532925e-05, | |
| "loss": 1.4413, | |
| "mean_token_accuracy": 0.612147618830204, | |
| "num_tokens": 7489106.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 1.895160937309265, | |
| "epoch": 3.2272, | |
| "grad_norm": 1.4345523118972778, | |
| "learning_rate": 1.5842636622285187e-05, | |
| "loss": 1.4207, | |
| "mean_token_accuracy": 0.6138400137424469, | |
| "num_tokens": 7564304.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "entropy": 1.9546802312135696, | |
| "epoch": 3.2592, | |
| "grad_norm": 1.5242680311203003, | |
| "learning_rate": 1.575834629259519e-05, | |
| "loss": 1.4435, | |
| "mean_token_accuracy": 0.6153354361653328, | |
| "num_tokens": 7637409.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "entropy": 1.912938117980957, | |
| "epoch": 3.2912, | |
| "grad_norm": 1.529726505279541, | |
| "learning_rate": 1.5673439381938365e-05, | |
| "loss": 1.4409, | |
| "mean_token_accuracy": 0.6191004544496537, | |
| "num_tokens": 7711595.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "entropy": 1.8989770442247391, | |
| "epoch": 3.3232, | |
| "grad_norm": 1.3367948532104492, | |
| "learning_rate": 1.5587924981811196e-05, | |
| "loss": 1.394, | |
| "mean_token_accuracy": 0.624155393242836, | |
| "num_tokens": 7785750.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "entropy": 1.932333904504776, | |
| "epoch": 3.3552, | |
| "grad_norm": 1.4732215404510498, | |
| "learning_rate": 1.5501812248757734e-05, | |
| "loss": 1.3959, | |
| "mean_token_accuracy": 0.6221834555268287, | |
| "num_tokens": 7859036.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "entropy": 1.9492982983589173, | |
| "epoch": 3.3872, | |
| "grad_norm": 1.4499313831329346, | |
| "learning_rate": 1.5415110403389166e-05, | |
| "loss": 1.4633, | |
| "mean_token_accuracy": 0.6100246667861938, | |
| "num_tokens": 7933165.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "entropy": 1.9063653618097305, | |
| "epoch": 3.4192, | |
| "grad_norm": 1.4364317655563354, | |
| "learning_rate": 1.5327828729396482e-05, | |
| "loss": 1.4216, | |
| "mean_token_accuracy": 0.6210869938135147, | |
| "num_tokens": 8009376.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "entropy": 1.9919361650943757, | |
| "epoch": 3.4512, | |
| "grad_norm": 1.5573089122772217, | |
| "learning_rate": 1.5239976572556438e-05, | |
| "loss": 1.5899, | |
| "mean_token_accuracy": 0.5991086520254612, | |
| "num_tokens": 8086825.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "entropy": 1.922476476430893, | |
| "epoch": 3.4832, | |
| "grad_norm": 1.3339344263076782, | |
| "learning_rate": 1.5151563339730849e-05, | |
| "loss": 1.4162, | |
| "mean_token_accuracy": 0.6182018965482712, | |
| "num_tokens": 8161726.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "entropy": 1.9143129527568816, | |
| "epoch": 3.5152, | |
| "grad_norm": 1.4425708055496216, | |
| "learning_rate": 1.506259849785931e-05, | |
| "loss": 1.4076, | |
| "mean_token_accuracy": 0.6197950705885887, | |
| "num_tokens": 8237046.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "entropy": 1.9093267023563385, | |
| "epoch": 3.5472, | |
| "grad_norm": 1.5437992811203003, | |
| "learning_rate": 1.497309157294555e-05, | |
| "loss": 1.4339, | |
| "mean_token_accuracy": 0.6177847877144813, | |
| "num_tokens": 8315350.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "entropy": 1.9121424347162246, | |
| "epoch": 3.5792, | |
| "grad_norm": 1.3761622905731201, | |
| "learning_rate": 1.4883052149037395e-05, | |
| "loss": 1.4175, | |
| "mean_token_accuracy": 0.6171463698148727, | |
| "num_tokens": 8390383.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "entropy": 1.883551675081253, | |
| "epoch": 3.6112, | |
| "grad_norm": 1.36739182472229, | |
| "learning_rate": 1.479248986720057e-05, | |
| "loss": 1.4158, | |
| "mean_token_accuracy": 0.6186214044690133, | |
| "num_tokens": 8468414.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "entropy": 1.988349151611328, | |
| "epoch": 3.6432, | |
| "grad_norm": 1.4566738605499268, | |
| "learning_rate": 1.4701414424486353e-05, | |
| "loss": 1.5366, | |
| "mean_token_accuracy": 0.6110676810145378, | |
| "num_tokens": 8541715.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "entropy": 1.9057112097740174, | |
| "epoch": 3.6752000000000002, | |
| "grad_norm": 1.499079704284668, | |
| "learning_rate": 1.4609835572893266e-05, | |
| "loss": 1.3991, | |
| "mean_token_accuracy": 0.6208718970417977, | |
| "num_tokens": 8615694.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "entropy": 1.9219326049089431, | |
| "epoch": 3.7072000000000003, | |
| "grad_norm": 1.3865621089935303, | |
| "learning_rate": 1.4517763118322861e-05, | |
| "loss": 1.431, | |
| "mean_token_accuracy": 0.6143050745129586, | |
| "num_tokens": 8692473.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "entropy": 1.9036399960517882, | |
| "epoch": 3.7392, | |
| "grad_norm": 1.5362603664398193, | |
| "learning_rate": 1.4425206919529747e-05, | |
| "loss": 1.4156, | |
| "mean_token_accuracy": 0.6199175521731377, | |
| "num_tokens": 8767618.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "entropy": 1.9499989479780198, | |
| "epoch": 3.7712, | |
| "grad_norm": 1.663404941558838, | |
| "learning_rate": 1.4332176887065955e-05, | |
| "loss": 1.4668, | |
| "mean_token_accuracy": 0.605186915397644, | |
| "num_tokens": 8843100.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "entropy": 1.9545456051826477, | |
| "epoch": 3.8032, | |
| "grad_norm": 1.6169345378875732, | |
| "learning_rate": 1.4238682982219753e-05, | |
| "loss": 1.4241, | |
| "mean_token_accuracy": 0.6206902250647545, | |
| "num_tokens": 8914604.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "entropy": 1.9130536198616028, | |
| "epoch": 3.8352, | |
| "grad_norm": 1.472740650177002, | |
| "learning_rate": 1.4144735215949028e-05, | |
| "loss": 1.4271, | |
| "mean_token_accuracy": 0.6139126420021057, | |
| "num_tokens": 8989305.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "entropy": 1.938635140657425, | |
| "epoch": 3.8672, | |
| "grad_norm": 1.4194226264953613, | |
| "learning_rate": 1.4050343647809354e-05, | |
| "loss": 1.4538, | |
| "mean_token_accuracy": 0.6131341770291329, | |
| "num_tokens": 9065589.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "entropy": 1.9123675346374511, | |
| "epoch": 3.8992, | |
| "grad_norm": 1.5208053588867188, | |
| "learning_rate": 1.3955518384876863e-05, | |
| "loss": 1.4309, | |
| "mean_token_accuracy": 0.6139545351266861, | |
| "num_tokens": 9140150.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "entropy": 1.9148090302944183, | |
| "epoch": 3.9312, | |
| "grad_norm": 1.6418218612670898, | |
| "learning_rate": 1.3860269580666004e-05, | |
| "loss": 1.4269, | |
| "mean_token_accuracy": 0.6169310078024864, | |
| "num_tokens": 9215796.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "entropy": 1.9157740741968154, | |
| "epoch": 3.9632, | |
| "grad_norm": 1.4638084173202515, | |
| "learning_rate": 1.3764607434042353e-05, | |
| "loss": 1.4509, | |
| "mean_token_accuracy": 0.6164968460798264, | |
| "num_tokens": 9291010.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "entropy": 1.9184510678052902, | |
| "epoch": 3.9952, | |
| "grad_norm": 1.5152716636657715, | |
| "learning_rate": 1.3668542188130567e-05, | |
| "loss": 1.4453, | |
| "mean_token_accuracy": 0.6112410992383956, | |
| "num_tokens": 9367186.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "entropy": 1.9016748384425515, | |
| "epoch": 4.0256, | |
| "grad_norm": 1.490628719329834, | |
| "learning_rate": 1.3572084129217566e-05, | |
| "loss": 1.382, | |
| "mean_token_accuracy": 0.623968276538347, | |
| "num_tokens": 9439028.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "entropy": 1.8026290327310561, | |
| "epoch": 4.0576, | |
| "grad_norm": 1.8969308137893677, | |
| "learning_rate": 1.347524358565115e-05, | |
| "loss": 1.3128, | |
| "mean_token_accuracy": 0.6386646836996078, | |
| "num_tokens": 9513855.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "entropy": 1.8283424764871596, | |
| "epoch": 4.0896, | |
| "grad_norm": 1.5952194929122925, | |
| "learning_rate": 1.3378030926734052e-05, | |
| "loss": 1.3362, | |
| "mean_token_accuracy": 0.6328515768051147, | |
| "num_tokens": 9589080.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "entropy": 1.8405955344438554, | |
| "epoch": 4.1216, | |
| "grad_norm": 1.6057584285736084, | |
| "learning_rate": 1.3280456561613653e-05, | |
| "loss": 1.4151, | |
| "mean_token_accuracy": 0.6261398203670978, | |
| "num_tokens": 9666808.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "entropy": 1.8390818655490875, | |
| "epoch": 4.1536, | |
| "grad_norm": 1.8149824142456055, | |
| "learning_rate": 1.3182530938167409e-05, | |
| "loss": 1.3455, | |
| "mean_token_accuracy": 0.6318597674369812, | |
| "num_tokens": 9740267.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "entropy": 1.8203887075185776, | |
| "epoch": 4.1856, | |
| "grad_norm": 1.6102676391601562, | |
| "learning_rate": 1.3084264541884118e-05, | |
| "loss": 1.3255, | |
| "mean_token_accuracy": 0.6316933467984199, | |
| "num_tokens": 9816400.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "entropy": 1.8592366576194763, | |
| "epoch": 4.2176, | |
| "grad_norm": 1.9501773118972778, | |
| "learning_rate": 1.2985667894741197e-05, | |
| "loss": 1.3521, | |
| "mean_token_accuracy": 0.6301594719290733, | |
| "num_tokens": 9889311.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "entropy": 1.8420085966587068, | |
| "epoch": 4.2496, | |
| "grad_norm": 1.6526106595993042, | |
| "learning_rate": 1.2886751554078015e-05, | |
| "loss": 1.3662, | |
| "mean_token_accuracy": 0.6302071824669838, | |
| "num_tokens": 9965339.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "entropy": 1.8313881188631058, | |
| "epoch": 4.2816, | |
| "grad_norm": 1.6269904375076294, | |
| "learning_rate": 1.2787526111465453e-05, | |
| "loss": 1.3579, | |
| "mean_token_accuracy": 0.6328388035297394, | |
| "num_tokens": 10039668.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "entropy": 1.858151137828827, | |
| "epoch": 4.3136, | |
| "grad_norm": 1.9028024673461914, | |
| "learning_rate": 1.2688002191571829e-05, | |
| "loss": 1.421, | |
| "mean_token_accuracy": 0.6276688367128372, | |
| "num_tokens": 10115387.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "entropy": 1.8273844957351684, | |
| "epoch": 4.3456, | |
| "grad_norm": 1.7530555725097656, | |
| "learning_rate": 1.2588190451025209e-05, | |
| "loss": 1.3527, | |
| "mean_token_accuracy": 0.6345869660377502, | |
| "num_tokens": 10191506.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "entropy": 1.8732422679662704, | |
| "epoch": 4.3776, | |
| "grad_norm": 1.7372691631317139, | |
| "learning_rate": 1.248810157727236e-05, | |
| "loss": 1.4132, | |
| "mean_token_accuracy": 0.6252246856689453, | |
| "num_tokens": 10268756.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "entropy": 1.8583054572343827, | |
| "epoch": 4.4096, | |
| "grad_norm": 1.6993470191955566, | |
| "learning_rate": 1.2387746287434385e-05, | |
| "loss": 1.3638, | |
| "mean_token_accuracy": 0.6286717876791954, | |
| "num_tokens": 10341779.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "entropy": 1.8324467271566391, | |
| "epoch": 4.4416, | |
| "grad_norm": 1.7818169593811035, | |
| "learning_rate": 1.2287135327159165e-05, | |
| "loss": 1.3372, | |
| "mean_token_accuracy": 0.6361263945698739, | |
| "num_tokens": 10414642.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "entropy": 1.8514392852783204, | |
| "epoch": 4.4736, | |
| "grad_norm": 1.7585517168045044, | |
| "learning_rate": 1.2186279469470757e-05, | |
| "loss": 1.3697, | |
| "mean_token_accuracy": 0.628801380097866, | |
| "num_tokens": 10489517.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "entropy": 1.8218136370182036, | |
| "epoch": 4.5056, | |
| "grad_norm": 1.9843116998672485, | |
| "learning_rate": 1.2085189513615872e-05, | |
| "loss": 1.3628, | |
| "mean_token_accuracy": 0.6295172199606895, | |
| "num_tokens": 10565467.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "entropy": 1.8919565021991729, | |
| "epoch": 4.5376, | |
| "grad_norm": 1.9309132099151611, | |
| "learning_rate": 1.1983876283907522e-05, | |
| "loss": 1.4467, | |
| "mean_token_accuracy": 0.6263746194541454, | |
| "num_tokens": 10641283.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "entropy": 1.8356508910655975, | |
| "epoch": 4.5696, | |
| "grad_norm": 1.7685068845748901, | |
| "learning_rate": 1.1882350628566008e-05, | |
| "loss": 1.3631, | |
| "mean_token_accuracy": 0.624418406188488, | |
| "num_tokens": 10716701.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "entropy": 1.8288098931312562, | |
| "epoch": 4.6016, | |
| "grad_norm": 1.8276050090789795, | |
| "learning_rate": 1.178062341855732e-05, | |
| "loss": 1.3619, | |
| "mean_token_accuracy": 0.6286922857165337, | |
| "num_tokens": 10791427.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "entropy": 1.8557640790939331, | |
| "epoch": 4.6336, | |
| "grad_norm": 1.7773240804672241, | |
| "learning_rate": 1.1678705546429132e-05, | |
| "loss": 1.383, | |
| "mean_token_accuracy": 0.6216814562678337, | |
| "num_tokens": 10866356.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "entropy": 1.8483826667070389, | |
| "epoch": 4.6655999999999995, | |
| "grad_norm": 1.831931710243225, | |
| "learning_rate": 1.1576607925144456e-05, | |
| "loss": 1.355, | |
| "mean_token_accuracy": 0.6278511002659798, | |
| "num_tokens": 10940772.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "entropy": 1.8824394553899766, | |
| "epoch": 4.6975999999999996, | |
| "grad_norm": 1.9213542938232422, | |
| "learning_rate": 1.1474341486913146e-05, | |
| "loss": 1.3767, | |
| "mean_token_accuracy": 0.6256057649850846, | |
| "num_tokens": 11016144.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "entropy": 1.8709469974040984, | |
| "epoch": 4.7296, | |
| "grad_norm": 1.8768925666809082, | |
| "learning_rate": 1.1371917182021297e-05, | |
| "loss": 1.3734, | |
| "mean_token_accuracy": 0.6317574754357338, | |
| "num_tokens": 11089939.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "entropy": 1.8673742085695266, | |
| "epoch": 4.7616, | |
| "grad_norm": 1.796302318572998, | |
| "learning_rate": 1.1269345977658747e-05, | |
| "loss": 1.3707, | |
| "mean_token_accuracy": 0.6238353416323662, | |
| "num_tokens": 11166087.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "entropy": 1.8310889720916748, | |
| "epoch": 4.7936, | |
| "grad_norm": 1.8969939947128296, | |
| "learning_rate": 1.1166638856744747e-05, | |
| "loss": 1.3373, | |
| "mean_token_accuracy": 0.6348015949130058, | |
| "num_tokens": 11240732.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "entropy": 1.8809226244688033, | |
| "epoch": 4.8256, | |
| "grad_norm": 1.642104983329773, | |
| "learning_rate": 1.1063806816751957e-05, | |
| "loss": 1.3792, | |
| "mean_token_accuracy": 0.6250617265701294, | |
| "num_tokens": 11316878.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "entropy": 1.8715822875499726, | |
| "epoch": 4.8576, | |
| "grad_norm": 1.962158441543579, | |
| "learning_rate": 1.0960860868528872e-05, | |
| "loss": 1.3711, | |
| "mean_token_accuracy": 0.6293752744793892, | |
| "num_tokens": 11389042.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "entropy": 1.8657191127538681, | |
| "epoch": 4.8896, | |
| "grad_norm": 1.9577444791793823, | |
| "learning_rate": 1.0857812035120845e-05, | |
| "loss": 1.379, | |
| "mean_token_accuracy": 0.6259156972169876, | |
| "num_tokens": 11464215.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "entropy": 1.8811951220035552, | |
| "epoch": 4.9216, | |
| "grad_norm": 2.015150785446167, | |
| "learning_rate": 1.0754671350589752e-05, | |
| "loss": 1.4155, | |
| "mean_token_accuracy": 0.626779156178236, | |
| "num_tokens": 11539122.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "entropy": 1.863905319571495, | |
| "epoch": 4.9536, | |
| "grad_norm": 1.8474093675613403, | |
| "learning_rate": 1.065144985883253e-05, | |
| "loss": 1.3409, | |
| "mean_token_accuracy": 0.6319419264793396, | |
| "num_tokens": 11613016.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "entropy": 1.836970153450966, | |
| "epoch": 4.9856, | |
| "grad_norm": 1.8822177648544312, | |
| "learning_rate": 1.054815861239864e-05, | |
| "loss": 1.3514, | |
| "mean_token_accuracy": 0.6292115703225136, | |
| "num_tokens": 11688143.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "entropy": 1.8377465950815302, | |
| "epoch": 5.016, | |
| "grad_norm": 1.8221346139907837, | |
| "learning_rate": 1.0444808671306588e-05, | |
| "loss": 1.3028, | |
| "mean_token_accuracy": 0.6413120329380035, | |
| "num_tokens": 11758768.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "entropy": 1.7883025139570237, | |
| "epoch": 5.048, | |
| "grad_norm": 2.1959595680236816, | |
| "learning_rate": 1.034141110185968e-05, | |
| "loss": 1.2797, | |
| "mean_token_accuracy": 0.6479741290211678, | |
| "num_tokens": 11832210.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "entropy": 1.7955584406852723, | |
| "epoch": 5.08, | |
| "grad_norm": 2.106905698776245, | |
| "learning_rate": 1.0237976975461074e-05, | |
| "loss": 1.2881, | |
| "mean_token_accuracy": 0.6424632370471954, | |
| "num_tokens": 11906115.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "entropy": 1.7998322755098344, | |
| "epoch": 5.112, | |
| "grad_norm": 2.327314615249634, | |
| "learning_rate": 1.0134517367428309e-05, | |
| "loss": 1.3324, | |
| "mean_token_accuracy": 0.6460248224437237, | |
| "num_tokens": 11981328.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "entropy": 1.7885828018188477, | |
| "epoch": 5.144, | |
| "grad_norm": 2.1001713275909424, | |
| "learning_rate": 1.0031043355807386e-05, | |
| "loss": 1.3098, | |
| "mean_token_accuracy": 0.63900758177042, | |
| "num_tokens": 12056453.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "entropy": 1.769435602426529, | |
| "epoch": 5.176, | |
| "grad_norm": 2.1210567951202393, | |
| "learning_rate": 9.927566020186592e-06, | |
| "loss": 1.2892, | |
| "mean_token_accuracy": 0.6432970002293587, | |
| "num_tokens": 12133433.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "entropy": 1.7907766073942184, | |
| "epoch": 5.208, | |
| "grad_norm": 2.1842658519744873, | |
| "learning_rate": 9.82409644051013e-06, | |
| "loss": 1.2856, | |
| "mean_token_accuracy": 0.6423615619540215, | |
| "num_tokens": 12207150.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "entropy": 1.7834827870130538, | |
| "epoch": 5.24, | |
| "grad_norm": 2.2503459453582764, | |
| "learning_rate": 9.720645695891733e-06, | |
| "loss": 1.3066, | |
| "mean_token_accuracy": 0.6417693704366684, | |
| "num_tokens": 12282584.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "entropy": 1.763256973028183, | |
| "epoch": 5.272, | |
| "grad_norm": 1.9505388736724854, | |
| "learning_rate": 9.617224863428346e-06, | |
| "loss": 1.2951, | |
| "mean_token_accuracy": 0.6429389595985413, | |
| "num_tokens": 12359793.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "entropy": 1.8142763644456863, | |
| "epoch": 5.304, | |
| "grad_norm": 1.9957698583602905, | |
| "learning_rate": 9.513845017014048e-06, | |
| "loss": 1.33, | |
| "mean_token_accuracy": 0.6413653999567032, | |
| "num_tokens": 12434251.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "entropy": 1.797221601009369, | |
| "epoch": 5.336, | |
| "grad_norm": 2.5095462799072266, | |
| "learning_rate": 9.410517226154276e-06, | |
| "loss": 1.2978, | |
| "mean_token_accuracy": 0.6389835774898529, | |
| "num_tokens": 12508416.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "entropy": 1.8157870292663574, | |
| "epoch": 5.368, | |
| "grad_norm": 2.1890602111816406, | |
| "learning_rate": 9.30725255478058e-06, | |
| "loss": 1.3183, | |
| "mean_token_accuracy": 0.6448161751031876, | |
| "num_tokens": 12582896.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "entropy": 1.7990054041147232, | |
| "epoch": 5.4, | |
| "grad_norm": 2.3904025554656982, | |
| "learning_rate": 9.204062060065915e-06, | |
| "loss": 1.3318, | |
| "mean_token_accuracy": 0.636146479845047, | |
| "num_tokens": 12656802.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "entropy": 1.8003453463315964, | |
| "epoch": 5.432, | |
| "grad_norm": 1.9204304218292236, | |
| "learning_rate": 9.100956791240699e-06, | |
| "loss": 1.3186, | |
| "mean_token_accuracy": 0.6372130662202835, | |
| "num_tokens": 12733283.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "entropy": 1.8101116061210631, | |
| "epoch": 5.464, | |
| "grad_norm": 2.009500026702881, | |
| "learning_rate": 8.997947788409696e-06, | |
| "loss": 1.3247, | |
| "mean_token_accuracy": 0.6406339526176452, | |
| "num_tokens": 12810272.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "entropy": 1.764935952425003, | |
| "epoch": 5.496, | |
| "grad_norm": 2.2038798332214355, | |
| "learning_rate": 8.89504608136989e-06, | |
| "loss": 1.2792, | |
| "mean_token_accuracy": 0.6445836886763573, | |
| "num_tokens": 12885633.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "entropy": 1.7950240582227708, | |
| "epoch": 5.5280000000000005, | |
| "grad_norm": 2.0160531997680664, | |
| "learning_rate": 8.792262688429445e-06, | |
| "loss": 1.2934, | |
| "mean_token_accuracy": 0.6469692558050155, | |
| "num_tokens": 12961131.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "entropy": 1.7804677098989488, | |
| "epoch": 5.5600000000000005, | |
| "grad_norm": 2.1956582069396973, | |
| "learning_rate": 8.689608615227933e-06, | |
| "loss": 1.2969, | |
| "mean_token_accuracy": 0.6438389763236045, | |
| "num_tokens": 13036481.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "entropy": 1.7932062089443206, | |
| "epoch": 5.592, | |
| "grad_norm": 2.2215394973754883, | |
| "learning_rate": 8.587094853557877e-06, | |
| "loss": 1.2907, | |
| "mean_token_accuracy": 0.6460438340902328, | |
| "num_tokens": 13111001.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "entropy": 1.8026408910751344, | |
| "epoch": 5.624, | |
| "grad_norm": 2.3881425857543945, | |
| "learning_rate": 8.484732380187785e-06, | |
| "loss": 1.3049, | |
| "mean_token_accuracy": 0.6414234206080437, | |
| "num_tokens": 13186347.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "entropy": 1.8440747499465941, | |
| "epoch": 5.656, | |
| "grad_norm": 2.2154159545898438, | |
| "learning_rate": 8.382532155686825e-06, | |
| "loss": 1.3797, | |
| "mean_token_accuracy": 0.6365857936441899, | |
| "num_tokens": 13261455.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "entropy": 1.7975190997123718, | |
| "epoch": 5.688, | |
| "grad_norm": 2.1991233825683594, | |
| "learning_rate": 8.280505123251183e-06, | |
| "loss": 1.3191, | |
| "mean_token_accuracy": 0.6393151715397835, | |
| "num_tokens": 13338064.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "entropy": 1.8396487146615983, | |
| "epoch": 5.72, | |
| "grad_norm": 2.0190858840942383, | |
| "learning_rate": 8.178662207532343e-06, | |
| "loss": 1.4052, | |
| "mean_token_accuracy": 0.629064130038023, | |
| "num_tokens": 13414806.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "entropy": 1.7840806126594544, | |
| "epoch": 5.752, | |
| "grad_norm": 2.3335204124450684, | |
| "learning_rate": 8.077014313467274e-06, | |
| "loss": 1.2701, | |
| "mean_token_accuracy": 0.6464540064334869, | |
| "num_tokens": 13489075.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "entropy": 1.7840022534132003, | |
| "epoch": 5.784, | |
| "grad_norm": 2.2151618003845215, | |
| "learning_rate": 7.975572325110819e-06, | |
| "loss": 1.3248, | |
| "mean_token_accuracy": 0.6358998969197274, | |
| "num_tokens": 13565636.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "entropy": 1.7677135676145554, | |
| "epoch": 5.816, | |
| "grad_norm": 2.11505389213562, | |
| "learning_rate": 7.874347104470234e-06, | |
| "loss": 1.2765, | |
| "mean_token_accuracy": 0.6448719501495361, | |
| "num_tokens": 13641112.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "entropy": 1.7586044907569884, | |
| "epoch": 5.848, | |
| "grad_norm": 2.178250789642334, | |
| "learning_rate": 7.773349490342157e-06, | |
| "loss": 1.2846, | |
| "mean_token_accuracy": 0.6450280979275703, | |
| "num_tokens": 13715158.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "entropy": 1.8128920108079911, | |
| "epoch": 5.88, | |
| "grad_norm": 2.2499353885650635, | |
| "learning_rate": 7.672590297152013e-06, | |
| "loss": 1.3248, | |
| "mean_token_accuracy": 0.6343795835971833, | |
| "num_tokens": 13791086.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "entropy": 1.7873643577098846, | |
| "epoch": 5.912, | |
| "grad_norm": 2.1989104747772217, | |
| "learning_rate": 7.572080313796064e-06, | |
| "loss": 1.2907, | |
| "mean_token_accuracy": 0.6413815975189209, | |
| "num_tokens": 13865700.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "entropy": 1.790488451719284, | |
| "epoch": 5.944, | |
| "grad_norm": 2.2605504989624023, | |
| "learning_rate": 7.471830302486151e-06, | |
| "loss": 1.2955, | |
| "mean_token_accuracy": 0.6424889475107193, | |
| "num_tokens": 13938540.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "entropy": 1.7985246628522873, | |
| "epoch": 5.976, | |
| "grad_norm": 2.3228533267974854, | |
| "learning_rate": 7.371850997597355e-06, | |
| "loss": 1.2854, | |
| "mean_token_accuracy": 0.6457341402769089, | |
| "num_tokens": 14011087.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "entropy": 1.7890221727521796, | |
| "epoch": 6.0064, | |
| "grad_norm": 2.192910671234131, | |
| "learning_rate": 7.272153104518567e-06, | |
| "loss": 1.2796, | |
| "mean_token_accuracy": 0.6482133128141102, | |
| "num_tokens": 14082075.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "entropy": 1.7633086562156677, | |
| "epoch": 6.0384, | |
| "grad_norm": 2.368185043334961, | |
| "learning_rate": 7.172747298506224e-06, | |
| "loss": 1.2703, | |
| "mean_token_accuracy": 0.6497290328145027, | |
| "num_tokens": 14156298.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "entropy": 1.750128635764122, | |
| "epoch": 6.0704, | |
| "grad_norm": 2.36487078666687, | |
| "learning_rate": 7.073644223541227e-06, | |
| "loss": 1.2521, | |
| "mean_token_accuracy": 0.6534707516431808, | |
| "num_tokens": 14232528.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "entropy": 1.7416553497314453, | |
| "epoch": 6.1024, | |
| "grad_norm": 2.3927595615386963, | |
| "learning_rate": 6.974854491189243e-06, | |
| "loss": 1.217, | |
| "mean_token_accuracy": 0.6588135868310928, | |
| "num_tokens": 14307073.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "entropy": 1.7359241485595702, | |
| "epoch": 6.1344, | |
| "grad_norm": 2.1107988357543945, | |
| "learning_rate": 6.876388679464437e-06, | |
| "loss": 1.2763, | |
| "mean_token_accuracy": 0.6550255373120308, | |
| "num_tokens": 14383819.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "entropy": 1.7380403220653533, | |
| "epoch": 6.1664, | |
| "grad_norm": 2.4158387184143066, | |
| "learning_rate": 6.7782573316968424e-06, | |
| "loss": 1.2359, | |
| "mean_token_accuracy": 0.656632873415947, | |
| "num_tokens": 14460092.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "entropy": 1.7227638810873032, | |
| "epoch": 6.1984, | |
| "grad_norm": 2.3467485904693604, | |
| "learning_rate": 6.6804709554034075e-06, | |
| "loss": 1.2311, | |
| "mean_token_accuracy": 0.654091839492321, | |
| "num_tokens": 14534160.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "entropy": 1.7244948148727417, | |
| "epoch": 6.2304, | |
| "grad_norm": 2.760057210922241, | |
| "learning_rate": 6.583040021162905e-06, | |
| "loss": 1.2189, | |
| "mean_token_accuracy": 0.6611428812146187, | |
| "num_tokens": 14608592.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "entropy": 1.7471657902002335, | |
| "epoch": 6.2624, | |
| "grad_norm": 2.3923745155334473, | |
| "learning_rate": 6.485974961494772e-06, | |
| "loss": 1.2631, | |
| "mean_token_accuracy": 0.6524021357297898, | |
| "num_tokens": 14683538.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "entropy": 1.7506494253873826, | |
| "epoch": 6.2943999999999996, | |
| "grad_norm": 2.4149715900421143, | |
| "learning_rate": 6.389286169742048e-06, | |
| "loss": 1.2468, | |
| "mean_token_accuracy": 0.6567713841795921, | |
| "num_tokens": 14755778.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "entropy": 1.7104488879442215, | |
| "epoch": 6.3264, | |
| "grad_norm": 2.632632255554199, | |
| "learning_rate": 6.292983998958478e-06, | |
| "loss": 1.2267, | |
| "mean_token_accuracy": 0.6561126798391342, | |
| "num_tokens": 14831036.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "entropy": 1.7591658294200898, | |
| "epoch": 6.3584, | |
| "grad_norm": 2.4012722969055176, | |
| "learning_rate": 6.1970787607999815e-06, | |
| "loss": 1.2693, | |
| "mean_token_accuracy": 0.6490694522857666, | |
| "num_tokens": 14906610.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "entropy": 1.7317969173192977, | |
| "epoch": 6.3904, | |
| "grad_norm": 2.8288748264312744, | |
| "learning_rate": 6.101580724420478e-06, | |
| "loss": 1.235, | |
| "mean_token_accuracy": 0.6564200609922409, | |
| "num_tokens": 14980134.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "entropy": 1.7617577254772185, | |
| "epoch": 6.4224, | |
| "grad_norm": 2.4008944034576416, | |
| "learning_rate": 6.00650011537235e-06, | |
| "loss": 1.2773, | |
| "mean_token_accuracy": 0.6494350075721741, | |
| "num_tokens": 15054969.0, | |
| "step": 2010 | |
| }, | |
| { | |
| "entropy": 1.749829688668251, | |
| "epoch": 6.4544, | |
| "grad_norm": 2.5665228366851807, | |
| "learning_rate": 5.911847114511497e-06, | |
| "loss": 1.2512, | |
| "mean_token_accuracy": 0.6512764275074006, | |
| "num_tokens": 15129421.0, | |
| "step": 2020 | |
| }, | |
| { | |
| "entropy": 1.7387797951698303, | |
| "epoch": 6.4864, | |
| "grad_norm": 2.6020922660827637, | |
| "learning_rate": 5.817631856907233e-06, | |
| "loss": 1.2477, | |
| "mean_token_accuracy": 0.6530226185917855, | |
| "num_tokens": 15203465.0, | |
| "step": 2030 | |
| }, | |
| { | |
| "entropy": 1.7363551884889603, | |
| "epoch": 6.5184, | |
| "grad_norm": 2.161478281021118, | |
| "learning_rate": 5.723864430757047e-06, | |
| "loss": 1.2692, | |
| "mean_token_accuracy": 0.6527093783020973, | |
| "num_tokens": 15279761.0, | |
| "step": 2040 | |
| }, | |
| { | |
| "entropy": 1.7563295543193818, | |
| "epoch": 6.5504, | |
| "grad_norm": 2.5587289333343506, | |
| "learning_rate": 5.630554876306407e-06, | |
| "loss": 1.2211, | |
| "mean_token_accuracy": 0.6574550330638885, | |
| "num_tokens": 15351301.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "entropy": 1.7521151036024094, | |
| "epoch": 6.5824, | |
| "grad_norm": 2.4042234420776367, | |
| "learning_rate": 5.537713184773686e-06, | |
| "loss": 1.271, | |
| "mean_token_accuracy": 0.6478641331195831, | |
| "num_tokens": 15427936.0, | |
| "step": 2060 | |
| }, | |
| { | |
| "entropy": 1.7270145863294601, | |
| "epoch": 6.6144, | |
| "grad_norm": 2.3122522830963135, | |
| "learning_rate": 5.44534929728036e-06, | |
| "loss": 1.224, | |
| "mean_token_accuracy": 0.6566437393426895, | |
| "num_tokens": 15502561.0, | |
| "step": 2070 | |
| }, | |
| { | |
| "entropy": 1.7568089962005615, | |
| "epoch": 6.6464, | |
| "grad_norm": 2.461474895477295, | |
| "learning_rate": 5.353473103786511e-06, | |
| "loss": 1.2864, | |
| "mean_token_accuracy": 0.6471308276057244, | |
| "num_tokens": 15578053.0, | |
| "step": 2080 | |
| }, | |
| { | |
| "entropy": 1.7190027862787247, | |
| "epoch": 6.6784, | |
| "grad_norm": 2.4412550926208496, | |
| "learning_rate": 5.262094442031901e-06, | |
| "loss": 1.2092, | |
| "mean_token_accuracy": 0.6601713746786118, | |
| "num_tokens": 15653342.0, | |
| "step": 2090 | |
| }, | |
| { | |
| "entropy": 1.717634916305542, | |
| "epoch": 6.7104, | |
| "grad_norm": 2.276007890701294, | |
| "learning_rate": 5.171223096482533e-06, | |
| "loss": 1.2271, | |
| "mean_token_accuracy": 0.6595920532941818, | |
| "num_tokens": 15730387.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "entropy": 1.7230647921562194, | |
| "epoch": 6.7424, | |
| "grad_norm": 2.480471134185791, | |
| "learning_rate": 5.080868797283019e-06, | |
| "loss": 1.229, | |
| "mean_token_accuracy": 0.6568982020020485, | |
| "num_tokens": 15804405.0, | |
| "step": 2110 | |
| }, | |
| { | |
| "entropy": 1.7535502433776855, | |
| "epoch": 6.7744, | |
| "grad_norm": 2.448997974395752, | |
| "learning_rate": 4.9910412192146795e-06, | |
| "loss": 1.2584, | |
| "mean_token_accuracy": 0.648795773088932, | |
| "num_tokens": 15878537.0, | |
| "step": 2120 | |
| }, | |
| { | |
| "entropy": 1.786664029955864, | |
| "epoch": 6.8064, | |
| "grad_norm": 2.430039405822754, | |
| "learning_rate": 4.901749980659617e-06, | |
| "loss": 1.3358, | |
| "mean_token_accuracy": 0.6427689291536808, | |
| "num_tokens": 15952964.0, | |
| "step": 2130 | |
| }, | |
| { | |
| "entropy": 1.7594995677471161, | |
| "epoch": 6.8384, | |
| "grad_norm": 2.469172716140747, | |
| "learning_rate": 4.813004642570822e-06, | |
| "loss": 1.2844, | |
| "mean_token_accuracy": 0.6534359715878963, | |
| "num_tokens": 16028086.0, | |
| "step": 2140 | |
| }, | |
| { | |
| "entropy": 1.7347292125225067, | |
| "epoch": 6.8704, | |
| "grad_norm": 2.6162445545196533, | |
| "learning_rate": 4.724814707448418e-06, | |
| "loss": 1.2707, | |
| "mean_token_accuracy": 0.6447671175003051, | |
| "num_tokens": 16103263.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "entropy": 1.7325938045978546, | |
| "epoch": 6.9024, | |
| "grad_norm": 2.416431188583374, | |
| "learning_rate": 4.637189618322173e-06, | |
| "loss": 1.2794, | |
| "mean_token_accuracy": 0.6439008563756943, | |
| "num_tokens": 16182360.0, | |
| "step": 2160 | |
| }, | |
| { | |
| "entropy": 1.7763712674379348, | |
| "epoch": 6.9344, | |
| "grad_norm": 2.3447437286376953, | |
| "learning_rate": 4.550138757740381e-06, | |
| "loss": 1.3043, | |
| "mean_token_accuracy": 0.650251479446888, | |
| "num_tokens": 16256272.0, | |
| "step": 2170 | |
| }, | |
| { | |
| "entropy": 1.739478302001953, | |
| "epoch": 6.9664, | |
| "grad_norm": 2.650451183319092, | |
| "learning_rate": 4.463671446765206e-06, | |
| "loss": 1.259, | |
| "mean_token_accuracy": 0.6514677822589874, | |
| "num_tokens": 16330984.0, | |
| "step": 2180 | |
| }, | |
| { | |
| "entropy": 1.7292406976222991, | |
| "epoch": 6.9984, | |
| "grad_norm": 2.5442306995391846, | |
| "learning_rate": 4.377796943974641e-06, | |
| "loss": 1.2554, | |
| "mean_token_accuracy": 0.6506337657570839, | |
| "num_tokens": 16406982.0, | |
| "step": 2190 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3130, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.5937216534515548e+18, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |