| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 50.0, | |
| "eval_steps": 500, | |
| "global_step": 1000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 0.5664891004562378, | |
| "epoch": 0.05, | |
| "grad_norm": 28.908475875854492, | |
| "learning_rate": 0.0, | |
| "loss": 1.6941, | |
| "mean_token_accuracy": 0.6499292850494385, | |
| "num_tokens": 262040.0, | |
| "step": 1 | |
| }, | |
| { | |
| "entropy": 0.5717914700508118, | |
| "epoch": 0.1, | |
| "grad_norm": 28.778432846069336, | |
| "learning_rate": 3.3333333333333334e-08, | |
| "loss": 1.6883, | |
| "mean_token_accuracy": 0.644157350063324, | |
| "num_tokens": 524072.0, | |
| "step": 2 | |
| }, | |
| { | |
| "entropy": 0.5704326629638672, | |
| "epoch": 0.15, | |
| "grad_norm": 29.55396842956543, | |
| "learning_rate": 6.666666666666667e-08, | |
| "loss": 1.6709, | |
| "mean_token_accuracy": 0.6549088954925537, | |
| "num_tokens": 786102.0, | |
| "step": 3 | |
| }, | |
| { | |
| "entropy": 0.5724536180496216, | |
| "epoch": 0.2, | |
| "grad_norm": 28.995824813842773, | |
| "learning_rate": 1e-07, | |
| "loss": 1.6821, | |
| "mean_token_accuracy": 0.6475409865379333, | |
| "num_tokens": 1048106.0, | |
| "step": 4 | |
| }, | |
| { | |
| "entropy": 0.5743661522865295, | |
| "epoch": 0.25, | |
| "grad_norm": 26.334909439086914, | |
| "learning_rate": 1.3333333333333334e-07, | |
| "loss": 1.6381, | |
| "mean_token_accuracy": 0.6527131795883179, | |
| "num_tokens": 1310172.0, | |
| "step": 5 | |
| }, | |
| { | |
| "entropy": 0.5727732181549072, | |
| "epoch": 0.3, | |
| "grad_norm": 28.073936462402344, | |
| "learning_rate": 1.6666666666666665e-07, | |
| "loss": 1.6422, | |
| "mean_token_accuracy": 0.6617563962936401, | |
| "num_tokens": 1572233.0, | |
| "step": 6 | |
| }, | |
| { | |
| "entropy": 0.57081538438797, | |
| "epoch": 0.35, | |
| "grad_norm": 28.08388900756836, | |
| "learning_rate": 2e-07, | |
| "loss": 1.6665, | |
| "mean_token_accuracy": 0.6370558142662048, | |
| "num_tokens": 1834270.0, | |
| "step": 7 | |
| }, | |
| { | |
| "entropy": 0.5670446157455444, | |
| "epoch": 0.4, | |
| "grad_norm": 28.71568489074707, | |
| "learning_rate": 2.3333333333333333e-07, | |
| "loss": 1.6712, | |
| "mean_token_accuracy": 0.6549586653709412, | |
| "num_tokens": 2096348.0, | |
| "step": 8 | |
| }, | |
| { | |
| "entropy": 0.5676782131195068, | |
| "epoch": 0.45, | |
| "grad_norm": 27.27318572998047, | |
| "learning_rate": 2.6666666666666667e-07, | |
| "loss": 1.7086, | |
| "mean_token_accuracy": 0.6470588445663452, | |
| "num_tokens": 2358417.0, | |
| "step": 9 | |
| }, | |
| { | |
| "entropy": 0.5776336789131165, | |
| "epoch": 0.5, | |
| "grad_norm": 28.332353591918945, | |
| "learning_rate": 3e-07, | |
| "loss": 1.6415, | |
| "mean_token_accuracy": 0.6566205620765686, | |
| "num_tokens": 2620440.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 0.572010338306427, | |
| "epoch": 0.55, | |
| "grad_norm": 27.795087814331055, | |
| "learning_rate": 3.333333333333333e-07, | |
| "loss": 1.7014, | |
| "mean_token_accuracy": 0.6489361524581909, | |
| "num_tokens": 2882489.0, | |
| "step": 11 | |
| }, | |
| { | |
| "entropy": 0.5682384967803955, | |
| "epoch": 0.6, | |
| "grad_norm": 28.871315002441406, | |
| "learning_rate": 3.666666666666666e-07, | |
| "loss": 1.7225, | |
| "mean_token_accuracy": 0.6407634615898132, | |
| "num_tokens": 3144549.0, | |
| "step": 12 | |
| }, | |
| { | |
| "entropy": 0.5681391954421997, | |
| "epoch": 0.65, | |
| "grad_norm": 27.233278274536133, | |
| "learning_rate": 4e-07, | |
| "loss": 1.5898, | |
| "mean_token_accuracy": 0.6640344858169556, | |
| "num_tokens": 3406634.0, | |
| "step": 13 | |
| }, | |
| { | |
| "entropy": 0.5735733509063721, | |
| "epoch": 0.7, | |
| "grad_norm": 28.354537963867188, | |
| "learning_rate": 4.3333333333333335e-07, | |
| "loss": 1.6334, | |
| "mean_token_accuracy": 0.6414058208465576, | |
| "num_tokens": 3668652.0, | |
| "step": 14 | |
| }, | |
| { | |
| "entropy": 0.5718874931335449, | |
| "epoch": 0.75, | |
| "grad_norm": 26.07267189025879, | |
| "learning_rate": 4.6666666666666666e-07, | |
| "loss": 1.6714, | |
| "mean_token_accuracy": 0.6530214548110962, | |
| "num_tokens": 3930718.0, | |
| "step": 15 | |
| }, | |
| { | |
| "entropy": 0.5689897537231445, | |
| "epoch": 0.8, | |
| "grad_norm": 26.958057403564453, | |
| "learning_rate": 5e-07, | |
| "loss": 1.6341, | |
| "mean_token_accuracy": 0.6614886522293091, | |
| "num_tokens": 4192790.0, | |
| "step": 16 | |
| }, | |
| { | |
| "entropy": 0.5694440603256226, | |
| "epoch": 0.85, | |
| "grad_norm": 25.388864517211914, | |
| "learning_rate": 5.333333333333333e-07, | |
| "loss": 1.5602, | |
| "mean_token_accuracy": 0.6735086441040039, | |
| "num_tokens": 4454876.0, | |
| "step": 17 | |
| }, | |
| { | |
| "entropy": 0.5694302916526794, | |
| "epoch": 0.9, | |
| "grad_norm": 27.550378799438477, | |
| "learning_rate": 5.666666666666666e-07, | |
| "loss": 1.572, | |
| "mean_token_accuracy": 0.6659559607505798, | |
| "num_tokens": 4716909.0, | |
| "step": 18 | |
| }, | |
| { | |
| "entropy": 0.5698995590209961, | |
| "epoch": 0.95, | |
| "grad_norm": 26.377119064331055, | |
| "learning_rate": 6e-07, | |
| "loss": 1.6109, | |
| "mean_token_accuracy": 0.6562277674674988, | |
| "num_tokens": 4978973.0, | |
| "step": 19 | |
| }, | |
| { | |
| "entropy": 0.566437304019928, | |
| "epoch": 1.0, | |
| "grad_norm": 27.442617416381836, | |
| "learning_rate": 6.333333333333332e-07, | |
| "loss": 1.6484, | |
| "mean_token_accuracy": 0.6565737128257751, | |
| "num_tokens": 5241019.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_entropy": 0.573745846748352, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.5927419066429138, | |
| "eval_num_tokens": 5241019.0, | |
| "eval_runtime": 0.5652, | |
| "eval_samples_per_second": 442.286, | |
| "eval_steps_per_second": 1.769, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 0.5741308927536011, | |
| "epoch": 1.05, | |
| "grad_norm": 26.800888061523438, | |
| "learning_rate": 6.666666666666666e-07, | |
| "loss": 1.5418, | |
| "mean_token_accuracy": 0.6670629978179932, | |
| "num_tokens": 5503030.0, | |
| "step": 21 | |
| }, | |
| { | |
| "entropy": 0.5693758726119995, | |
| "epoch": 1.1, | |
| "grad_norm": 24.638330459594727, | |
| "learning_rate": 7e-07, | |
| "loss": 1.4704, | |
| "mean_token_accuracy": 0.6821052432060242, | |
| "num_tokens": 5765114.0, | |
| "step": 22 | |
| }, | |
| { | |
| "entropy": 0.5774779915809631, | |
| "epoch": 1.15, | |
| "grad_norm": 23.92709732055664, | |
| "learning_rate": 7.333333333333332e-07, | |
| "loss": 1.3992, | |
| "mean_token_accuracy": 0.7002063989639282, | |
| "num_tokens": 6027183.0, | |
| "step": 23 | |
| }, | |
| { | |
| "entropy": 0.5733228921890259, | |
| "epoch": 1.2, | |
| "grad_norm": 20.69150733947754, | |
| "learning_rate": 7.666666666666667e-07, | |
| "loss": 1.2931, | |
| "mean_token_accuracy": 0.732215166091919, | |
| "num_tokens": 6289274.0, | |
| "step": 24 | |
| }, | |
| { | |
| "entropy": 0.5707331895828247, | |
| "epoch": 1.25, | |
| "grad_norm": 21.446800231933594, | |
| "learning_rate": 8e-07, | |
| "loss": 1.3543, | |
| "mean_token_accuracy": 0.7079277038574219, | |
| "num_tokens": 6551338.0, | |
| "step": 25 | |
| }, | |
| { | |
| "entropy": 0.5755348801612854, | |
| "epoch": 1.3, | |
| "grad_norm": 22.206480026245117, | |
| "learning_rate": 8.333333333333333e-07, | |
| "loss": 1.3262, | |
| "mean_token_accuracy": 0.7222517132759094, | |
| "num_tokens": 6813352.0, | |
| "step": 26 | |
| }, | |
| { | |
| "entropy": 0.5768681168556213, | |
| "epoch": 1.35, | |
| "grad_norm": 21.231828689575195, | |
| "learning_rate": 8.666666666666667e-07, | |
| "loss": 1.2112, | |
| "mean_token_accuracy": 0.7373400330543518, | |
| "num_tokens": 7075412.0, | |
| "step": 27 | |
| }, | |
| { | |
| "entropy": 0.5692260265350342, | |
| "epoch": 1.4, | |
| "grad_norm": 22.956790924072266, | |
| "learning_rate": 9e-07, | |
| "loss": 1.3309, | |
| "mean_token_accuracy": 0.7283333539962769, | |
| "num_tokens": 7337436.0, | |
| "step": 28 | |
| }, | |
| { | |
| "entropy": 0.5696606636047363, | |
| "epoch": 1.45, | |
| "grad_norm": 21.15884780883789, | |
| "learning_rate": 9.333333333333333e-07, | |
| "loss": 1.2323, | |
| "mean_token_accuracy": 0.7210144996643066, | |
| "num_tokens": 7599508.0, | |
| "step": 29 | |
| }, | |
| { | |
| "entropy": 0.5705867409706116, | |
| "epoch": 1.5, | |
| "grad_norm": 21.3349609375, | |
| "learning_rate": 9.666666666666666e-07, | |
| "loss": 1.2339, | |
| "mean_token_accuracy": 0.7246376872062683, | |
| "num_tokens": 7861547.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 0.5731691718101501, | |
| "epoch": 1.55, | |
| "grad_norm": 19.02399253845215, | |
| "learning_rate": 1e-06, | |
| "loss": 1.1907, | |
| "mean_token_accuracy": 0.724764883518219, | |
| "num_tokens": 8123603.0, | |
| "step": 31 | |
| }, | |
| { | |
| "entropy": 0.575802206993103, | |
| "epoch": 1.6, | |
| "grad_norm": 13.962172508239746, | |
| "learning_rate": 1e-06, | |
| "loss": 0.9259, | |
| "mean_token_accuracy": 0.7593783736228943, | |
| "num_tokens": 8385666.0, | |
| "step": 32 | |
| }, | |
| { | |
| "entropy": 0.5742422342300415, | |
| "epoch": 1.65, | |
| "grad_norm": 13.67746353149414, | |
| "learning_rate": 1e-06, | |
| "loss": 0.8003, | |
| "mean_token_accuracy": 0.7683302760124207, | |
| "num_tokens": 8647717.0, | |
| "step": 33 | |
| }, | |
| { | |
| "entropy": 0.5761300325393677, | |
| "epoch": 1.7, | |
| "grad_norm": 13.221238136291504, | |
| "learning_rate": 1e-06, | |
| "loss": 0.8127, | |
| "mean_token_accuracy": 0.7722646594047546, | |
| "num_tokens": 8909783.0, | |
| "step": 34 | |
| }, | |
| { | |
| "entropy": 0.5746512413024902, | |
| "epoch": 1.75, | |
| "grad_norm": 14.354029655456543, | |
| "learning_rate": 1e-06, | |
| "loss": 0.9135, | |
| "mean_token_accuracy": 0.7408514022827148, | |
| "num_tokens": 9171847.0, | |
| "step": 35 | |
| }, | |
| { | |
| "entropy": 0.581115186214447, | |
| "epoch": 1.8, | |
| "grad_norm": 13.553462982177734, | |
| "learning_rate": 1e-06, | |
| "loss": 0.8726, | |
| "mean_token_accuracy": 0.7709611654281616, | |
| "num_tokens": 9433868.0, | |
| "step": 36 | |
| }, | |
| { | |
| "entropy": 0.5802749395370483, | |
| "epoch": 1.85, | |
| "grad_norm": 13.30045223236084, | |
| "learning_rate": 1e-06, | |
| "loss": 0.7273, | |
| "mean_token_accuracy": 0.7865416407585144, | |
| "num_tokens": 9695878.0, | |
| "step": 37 | |
| }, | |
| { | |
| "entropy": 0.5772420763969421, | |
| "epoch": 1.9, | |
| "grad_norm": 12.090519905090332, | |
| "learning_rate": 1e-06, | |
| "loss": 0.7367, | |
| "mean_token_accuracy": 0.8035824298858643, | |
| "num_tokens": 9957925.0, | |
| "step": 38 | |
| }, | |
| { | |
| "entropy": 0.5726953744888306, | |
| "epoch": 1.95, | |
| "grad_norm": 12.22325325012207, | |
| "learning_rate": 1e-06, | |
| "loss": 0.794, | |
| "mean_token_accuracy": 0.7968627214431763, | |
| "num_tokens": 10219991.0, | |
| "step": 39 | |
| }, | |
| { | |
| "entropy": 0.5777957439422607, | |
| "epoch": 2.0, | |
| "grad_norm": 11.300572395324707, | |
| "learning_rate": 1e-06, | |
| "loss": 0.6735, | |
| "mean_token_accuracy": 0.8091511130332947, | |
| "num_tokens": 10482014.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_entropy": 0.5787050724029541, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.7641128897666931, | |
| "eval_num_tokens": 10482014.0, | |
| "eval_runtime": 0.5646, | |
| "eval_samples_per_second": 442.773, | |
| "eval_steps_per_second": 1.771, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 0.5709609985351562, | |
| "epoch": 2.05, | |
| "grad_norm": 9.544706344604492, | |
| "learning_rate": 1e-06, | |
| "loss": 0.5315, | |
| "mean_token_accuracy": 0.8500468730926514, | |
| "num_tokens": 10744103.0, | |
| "step": 41 | |
| }, | |
| { | |
| "entropy": 0.5764386653900146, | |
| "epoch": 2.1, | |
| "grad_norm": 10.447105407714844, | |
| "learning_rate": 1e-06, | |
| "loss": 0.7079, | |
| "mean_token_accuracy": 0.8055056929588318, | |
| "num_tokens": 11006169.0, | |
| "step": 42 | |
| }, | |
| { | |
| "entropy": 0.5761690139770508, | |
| "epoch": 2.15, | |
| "grad_norm": 8.850165367126465, | |
| "learning_rate": 1e-06, | |
| "loss": 0.5672, | |
| "mean_token_accuracy": 0.8491296172142029, | |
| "num_tokens": 11268181.0, | |
| "step": 43 | |
| }, | |
| { | |
| "entropy": 0.5757296085357666, | |
| "epoch": 2.2, | |
| "grad_norm": 7.652801036834717, | |
| "learning_rate": 1e-06, | |
| "loss": 0.5925, | |
| "mean_token_accuracy": 0.8589305877685547, | |
| "num_tokens": 11530235.0, | |
| "step": 44 | |
| }, | |
| { | |
| "entropy": 0.5732800960540771, | |
| "epoch": 2.25, | |
| "grad_norm": 8.43525505065918, | |
| "learning_rate": 1e-06, | |
| "loss": 0.5096, | |
| "mean_token_accuracy": 0.8493317365646362, | |
| "num_tokens": 11792276.0, | |
| "step": 45 | |
| }, | |
| { | |
| "entropy": 0.5726200342178345, | |
| "epoch": 2.3, | |
| "grad_norm": 6.224635601043701, | |
| "learning_rate": 1e-06, | |
| "loss": 0.3717, | |
| "mean_token_accuracy": 0.8944099545478821, | |
| "num_tokens": 12054314.0, | |
| "step": 46 | |
| }, | |
| { | |
| "entropy": 0.5759143829345703, | |
| "epoch": 2.35, | |
| "grad_norm": 7.955408096313477, | |
| "learning_rate": 1e-06, | |
| "loss": 0.4937, | |
| "mean_token_accuracy": 0.8561508059501648, | |
| "num_tokens": 12316329.0, | |
| "step": 47 | |
| }, | |
| { | |
| "entropy": 0.5743073225021362, | |
| "epoch": 2.4, | |
| "grad_norm": 8.218153953552246, | |
| "learning_rate": 1e-06, | |
| "loss": 0.4752, | |
| "mean_token_accuracy": 0.8561111092567444, | |
| "num_tokens": 12578392.0, | |
| "step": 48 | |
| }, | |
| { | |
| "entropy": 0.5710327625274658, | |
| "epoch": 2.45, | |
| "grad_norm": 6.69417667388916, | |
| "learning_rate": 1e-06, | |
| "loss": 0.4361, | |
| "mean_token_accuracy": 0.8712534308433533, | |
| "num_tokens": 12840453.0, | |
| "step": 49 | |
| }, | |
| { | |
| "entropy": 0.5683552622795105, | |
| "epoch": 2.5, | |
| "grad_norm": 7.398046016693115, | |
| "learning_rate": 1e-06, | |
| "loss": 0.4676, | |
| "mean_token_accuracy": 0.8655256628990173, | |
| "num_tokens": 13102537.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 0.5726050734519958, | |
| "epoch": 2.55, | |
| "grad_norm": 5.699220657348633, | |
| "learning_rate": 1e-06, | |
| "loss": 0.3854, | |
| "mean_token_accuracy": 0.8902208209037781, | |
| "num_tokens": 13364583.0, | |
| "step": 51 | |
| }, | |
| { | |
| "entropy": 0.5710792541503906, | |
| "epoch": 2.6, | |
| "grad_norm": 5.051173210144043, | |
| "learning_rate": 1e-06, | |
| "loss": 0.3505, | |
| "mean_token_accuracy": 0.8793442845344543, | |
| "num_tokens": 13626635.0, | |
| "step": 52 | |
| }, | |
| { | |
| "entropy": 0.5732054710388184, | |
| "epoch": 2.65, | |
| "grad_norm": 4.918524265289307, | |
| "learning_rate": 1e-06, | |
| "loss": 0.351, | |
| "mean_token_accuracy": 0.87595534324646, | |
| "num_tokens": 13888665.0, | |
| "step": 53 | |
| }, | |
| { | |
| "entropy": 0.5696585178375244, | |
| "epoch": 2.7, | |
| "grad_norm": 5.258333206176758, | |
| "learning_rate": 1e-06, | |
| "loss": 0.3094, | |
| "mean_token_accuracy": 0.8884353637695312, | |
| "num_tokens": 14150728.0, | |
| "step": 54 | |
| }, | |
| { | |
| "entropy": 0.5700670480728149, | |
| "epoch": 2.75, | |
| "grad_norm": 5.786867618560791, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2947, | |
| "mean_token_accuracy": 0.8788819909095764, | |
| "num_tokens": 14412733.0, | |
| "step": 55 | |
| }, | |
| { | |
| "entropy": 0.5727378129959106, | |
| "epoch": 2.8, | |
| "grad_norm": 4.969060897827148, | |
| "learning_rate": 1e-06, | |
| "loss": 0.4303, | |
| "mean_token_accuracy": 0.8706739544868469, | |
| "num_tokens": 14674775.0, | |
| "step": 56 | |
| }, | |
| { | |
| "entropy": 0.5705511569976807, | |
| "epoch": 2.85, | |
| "grad_norm": 6.415738105773926, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2955, | |
| "mean_token_accuracy": 0.8876941204071045, | |
| "num_tokens": 14936877.0, | |
| "step": 57 | |
| }, | |
| { | |
| "entropy": 0.572034478187561, | |
| "epoch": 2.9, | |
| "grad_norm": 5.3498029708862305, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2835, | |
| "mean_token_accuracy": 0.8952603936195374, | |
| "num_tokens": 15198915.0, | |
| "step": 58 | |
| }, | |
| { | |
| "entropy": 0.5720596313476562, | |
| "epoch": 2.95, | |
| "grad_norm": 3.9302492141723633, | |
| "learning_rate": 1e-06, | |
| "loss": 0.3066, | |
| "mean_token_accuracy": 0.8984684944152832, | |
| "num_tokens": 15460974.0, | |
| "step": 59 | |
| }, | |
| { | |
| "entropy": 0.5642393231391907, | |
| "epoch": 3.0, | |
| "grad_norm": 3.9795563220977783, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2578, | |
| "mean_token_accuracy": 0.9144676923751831, | |
| "num_tokens": 15723062.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_entropy": 0.5693275332450867, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.8682795763015747, | |
| "eval_num_tokens": 15723062.0, | |
| "eval_runtime": 0.5625, | |
| "eval_samples_per_second": 444.468, | |
| "eval_steps_per_second": 1.778, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 0.5673961043357849, | |
| "epoch": 3.05, | |
| "grad_norm": 3.3862600326538086, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2377, | |
| "mean_token_accuracy": 0.9254385828971863, | |
| "num_tokens": 15985152.0, | |
| "step": 61 | |
| }, | |
| { | |
| "entropy": 0.5655107498168945, | |
| "epoch": 3.1, | |
| "grad_norm": 4.405587196350098, | |
| "learning_rate": 1e-06, | |
| "loss": 0.3086, | |
| "mean_token_accuracy": 0.8858006000518799, | |
| "num_tokens": 16247235.0, | |
| "step": 62 | |
| }, | |
| { | |
| "entropy": 0.5651353597640991, | |
| "epoch": 3.15, | |
| "grad_norm": 3.514052391052246, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2306, | |
| "mean_token_accuracy": 0.9162125587463379, | |
| "num_tokens": 16509263.0, | |
| "step": 63 | |
| }, | |
| { | |
| "entropy": 0.5678315162658691, | |
| "epoch": 3.2, | |
| "grad_norm": 4.790067672729492, | |
| "learning_rate": 1e-06, | |
| "loss": 0.249, | |
| "mean_token_accuracy": 0.9055072665214539, | |
| "num_tokens": 16771251.0, | |
| "step": 64 | |
| }, | |
| { | |
| "entropy": 0.5601281523704529, | |
| "epoch": 3.25, | |
| "grad_norm": 4.446920394897461, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2501, | |
| "mean_token_accuracy": 0.9066374897956848, | |
| "num_tokens": 17033314.0, | |
| "step": 65 | |
| }, | |
| { | |
| "entropy": 0.5643904209136963, | |
| "epoch": 3.3, | |
| "grad_norm": 3.9066216945648193, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2619, | |
| "mean_token_accuracy": 0.9103641510009766, | |
| "num_tokens": 17295329.0, | |
| "step": 66 | |
| }, | |
| { | |
| "entropy": 0.5618723630905151, | |
| "epoch": 3.35, | |
| "grad_norm": 3.168095588684082, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2269, | |
| "mean_token_accuracy": 0.9209107160568237, | |
| "num_tokens": 17557360.0, | |
| "step": 67 | |
| }, | |
| { | |
| "entropy": 0.5572832822799683, | |
| "epoch": 3.4, | |
| "grad_norm": 4.440161228179932, | |
| "learning_rate": 1e-06, | |
| "loss": 0.28, | |
| "mean_token_accuracy": 0.8996282815933228, | |
| "num_tokens": 17819430.0, | |
| "step": 68 | |
| }, | |
| { | |
| "entropy": 0.5604659914970398, | |
| "epoch": 3.45, | |
| "grad_norm": 3.969372510910034, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2705, | |
| "mean_token_accuracy": 0.9063779711723328, | |
| "num_tokens": 18081468.0, | |
| "step": 69 | |
| }, | |
| { | |
| "entropy": 0.5578862428665161, | |
| "epoch": 3.5, | |
| "grad_norm": 4.655684947967529, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2104, | |
| "mean_token_accuracy": 0.9167120456695557, | |
| "num_tokens": 18343568.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 0.5561624765396118, | |
| "epoch": 3.55, | |
| "grad_norm": 4.448247909545898, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2552, | |
| "mean_token_accuracy": 0.8955500721931458, | |
| "num_tokens": 18605614.0, | |
| "step": 71 | |
| }, | |
| { | |
| "entropy": 0.5536712408065796, | |
| "epoch": 3.6, | |
| "grad_norm": 4.12972354888916, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2933, | |
| "mean_token_accuracy": 0.8912237286567688, | |
| "num_tokens": 18867660.0, | |
| "step": 72 | |
| }, | |
| { | |
| "entropy": 0.5511398315429688, | |
| "epoch": 3.65, | |
| "grad_norm": 4.112148284912109, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2228, | |
| "mean_token_accuracy": 0.9108073115348816, | |
| "num_tokens": 19129723.0, | |
| "step": 73 | |
| }, | |
| { | |
| "entropy": 0.5500166416168213, | |
| "epoch": 3.7, | |
| "grad_norm": 4.219006538391113, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2266, | |
| "mean_token_accuracy": 0.9114799499511719, | |
| "num_tokens": 19391795.0, | |
| "step": 74 | |
| }, | |
| { | |
| "entropy": 0.5509252548217773, | |
| "epoch": 3.75, | |
| "grad_norm": 5.647234916687012, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1964, | |
| "mean_token_accuracy": 0.9147146940231323, | |
| "num_tokens": 19653855.0, | |
| "step": 75 | |
| }, | |
| { | |
| "entropy": 0.551358699798584, | |
| "epoch": 3.8, | |
| "grad_norm": 3.1081528663635254, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1849, | |
| "mean_token_accuracy": 0.9201655983924866, | |
| "num_tokens": 19915908.0, | |
| "step": 76 | |
| }, | |
| { | |
| "entropy": 0.5487557053565979, | |
| "epoch": 3.85, | |
| "grad_norm": 4.483115196228027, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2306, | |
| "mean_token_accuracy": 0.9138851761817932, | |
| "num_tokens": 20177933.0, | |
| "step": 77 | |
| }, | |
| { | |
| "entropy": 0.5503401160240173, | |
| "epoch": 3.9, | |
| "grad_norm": 3.959207534790039, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1672, | |
| "mean_token_accuracy": 0.9275280833244324, | |
| "num_tokens": 20439976.0, | |
| "step": 78 | |
| }, | |
| { | |
| "entropy": 0.544685959815979, | |
| "epoch": 3.95, | |
| "grad_norm": 3.581266403198242, | |
| "learning_rate": 1e-06, | |
| "loss": 0.176, | |
| "mean_token_accuracy": 0.9270231127738953, | |
| "num_tokens": 20702019.0, | |
| "step": 79 | |
| }, | |
| { | |
| "entropy": 0.5437827110290527, | |
| "epoch": 4.0, | |
| "grad_norm": 5.461308002471924, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2157, | |
| "mean_token_accuracy": 0.9198629856109619, | |
| "num_tokens": 20964105.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_entropy": 0.5438513159751892, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.8924731016159058, | |
| "eval_num_tokens": 20964105.0, | |
| "eval_runtime": 0.5684, | |
| "eval_samples_per_second": 439.865, | |
| "eval_steps_per_second": 1.759, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 0.5403033494949341, | |
| "epoch": 4.05, | |
| "grad_norm": 4.5201497077941895, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2073, | |
| "mean_token_accuracy": 0.9200834035873413, | |
| "num_tokens": 21226170.0, | |
| "step": 81 | |
| }, | |
| { | |
| "entropy": 0.5385745763778687, | |
| "epoch": 4.1, | |
| "grad_norm": 3.7788710594177246, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2063, | |
| "mean_token_accuracy": 0.9138225317001343, | |
| "num_tokens": 21488232.0, | |
| "step": 82 | |
| }, | |
| { | |
| "entropy": 0.5423810482025146, | |
| "epoch": 4.15, | |
| "grad_norm": 3.069916248321533, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1746, | |
| "mean_token_accuracy": 0.9218472242355347, | |
| "num_tokens": 21750283.0, | |
| "step": 83 | |
| }, | |
| { | |
| "entropy": 0.5381914973258972, | |
| "epoch": 4.2, | |
| "grad_norm": 4.174190521240234, | |
| "learning_rate": 1e-06, | |
| "loss": 0.206, | |
| "mean_token_accuracy": 0.9193011522293091, | |
| "num_tokens": 22012342.0, | |
| "step": 84 | |
| }, | |
| { | |
| "entropy": 0.5410702228546143, | |
| "epoch": 4.25, | |
| "grad_norm": 4.735873222351074, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2291, | |
| "mean_token_accuracy": 0.9051008224487305, | |
| "num_tokens": 22274390.0, | |
| "step": 85 | |
| }, | |
| { | |
| "entropy": 0.5410845279693604, | |
| "epoch": 4.3, | |
| "grad_norm": 2.7942323684692383, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1754, | |
| "mean_token_accuracy": 0.9240579605102539, | |
| "num_tokens": 22536477.0, | |
| "step": 86 | |
| }, | |
| { | |
| "entropy": 0.5370749831199646, | |
| "epoch": 4.35, | |
| "grad_norm": 4.275319576263428, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1859, | |
| "mean_token_accuracy": 0.9190635681152344, | |
| "num_tokens": 22798499.0, | |
| "step": 87 | |
| }, | |
| { | |
| "entropy": 0.5372530221939087, | |
| "epoch": 4.4, | |
| "grad_norm": 3.8254811763763428, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1581, | |
| "mean_token_accuracy": 0.9244146943092346, | |
| "num_tokens": 23060554.0, | |
| "step": 88 | |
| }, | |
| { | |
| "entropy": 0.5390424728393555, | |
| "epoch": 4.45, | |
| "grad_norm": 3.9321508407592773, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1901, | |
| "mean_token_accuracy": 0.9273531436920166, | |
| "num_tokens": 23322598.0, | |
| "step": 89 | |
| }, | |
| { | |
| "entropy": 0.5382624864578247, | |
| "epoch": 4.5, | |
| "grad_norm": 3.039321184158325, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1636, | |
| "mean_token_accuracy": 0.9295870065689087, | |
| "num_tokens": 23584635.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 0.5385315418243408, | |
| "epoch": 4.55, | |
| "grad_norm": 3.339580774307251, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1749, | |
| "mean_token_accuracy": 0.9232121706008911, | |
| "num_tokens": 23846703.0, | |
| "step": 91 | |
| }, | |
| { | |
| "entropy": 0.5352039337158203, | |
| "epoch": 4.6, | |
| "grad_norm": 3.184174060821533, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1675, | |
| "mean_token_accuracy": 0.9252479076385498, | |
| "num_tokens": 24108772.0, | |
| "step": 92 | |
| }, | |
| { | |
| "entropy": 0.5365791320800781, | |
| "epoch": 4.65, | |
| "grad_norm": 2.1336984634399414, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1654, | |
| "mean_token_accuracy": 0.9318456053733826, | |
| "num_tokens": 24370792.0, | |
| "step": 93 | |
| }, | |
| { | |
| "entropy": 0.535903811454773, | |
| "epoch": 4.7, | |
| "grad_norm": 3.2332236766815186, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1706, | |
| "mean_token_accuracy": 0.9316656589508057, | |
| "num_tokens": 24632826.0, | |
| "step": 94 | |
| }, | |
| { | |
| "entropy": 0.5376459360122681, | |
| "epoch": 4.75, | |
| "grad_norm": 4.174566268920898, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1938, | |
| "mean_token_accuracy": 0.9220023155212402, | |
| "num_tokens": 24894906.0, | |
| "step": 95 | |
| }, | |
| { | |
| "entropy": 0.5379908084869385, | |
| "epoch": 4.8, | |
| "grad_norm": 2.7120425701141357, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1818, | |
| "mean_token_accuracy": 0.9200000166893005, | |
| "num_tokens": 25156928.0, | |
| "step": 96 | |
| }, | |
| { | |
| "entropy": 0.5380344986915588, | |
| "epoch": 4.85, | |
| "grad_norm": 3.080734968185425, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1615, | |
| "mean_token_accuracy": 0.934725821018219, | |
| "num_tokens": 25419007.0, | |
| "step": 97 | |
| }, | |
| { | |
| "entropy": 0.5391592979431152, | |
| "epoch": 4.9, | |
| "grad_norm": 2.436408519744873, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1602, | |
| "mean_token_accuracy": 0.9324124455451965, | |
| "num_tokens": 25681033.0, | |
| "step": 98 | |
| }, | |
| { | |
| "entropy": 0.5356138348579407, | |
| "epoch": 4.95, | |
| "grad_norm": 3.9103612899780273, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1553, | |
| "mean_token_accuracy": 0.9342178106307983, | |
| "num_tokens": 25943105.0, | |
| "step": 99 | |
| }, | |
| { | |
| "entropy": 0.5300828218460083, | |
| "epoch": 5.0, | |
| "grad_norm": 3.9791324138641357, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1867, | |
| "mean_token_accuracy": 0.9147771596908569, | |
| "num_tokens": 26205142.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_entropy": 0.532131552696228, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.8958333134651184, | |
| "eval_num_tokens": 26205142.0, | |
| "eval_runtime": 0.5658, | |
| "eval_samples_per_second": 441.867, | |
| "eval_steps_per_second": 1.767, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 0.5331390500068665, | |
| "epoch": 5.05, | |
| "grad_norm": 3.375486373901367, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1651, | |
| "mean_token_accuracy": 0.9255132079124451, | |
| "num_tokens": 26467176.0, | |
| "step": 101 | |
| }, | |
| { | |
| "entropy": 0.5314509868621826, | |
| "epoch": 5.1, | |
| "grad_norm": 3.910857915878296, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1619, | |
| "mean_token_accuracy": 0.9261786341667175, | |
| "num_tokens": 26729216.0, | |
| "step": 102 | |
| }, | |
| { | |
| "entropy": 0.5281578302383423, | |
| "epoch": 5.15, | |
| "grad_norm": 2.9953229427337646, | |
| "learning_rate": 1e-06, | |
| "loss": 0.169, | |
| "mean_token_accuracy": 0.935003399848938, | |
| "num_tokens": 26991253.0, | |
| "step": 103 | |
| }, | |
| { | |
| "entropy": 0.5287604331970215, | |
| "epoch": 5.2, | |
| "grad_norm": 2.6056575775146484, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1483, | |
| "mean_token_accuracy": 0.9292088150978088, | |
| "num_tokens": 27253296.0, | |
| "step": 104 | |
| }, | |
| { | |
| "entropy": 0.5280485153198242, | |
| "epoch": 5.25, | |
| "grad_norm": 3.1244957447052, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1576, | |
| "mean_token_accuracy": 0.9250302314758301, | |
| "num_tokens": 27515312.0, | |
| "step": 105 | |
| }, | |
| { | |
| "entropy": 0.5249028205871582, | |
| "epoch": 5.3, | |
| "grad_norm": 3.752169370651245, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1659, | |
| "mean_token_accuracy": 0.9309778213500977, | |
| "num_tokens": 27777353.0, | |
| "step": 106 | |
| }, | |
| { | |
| "entropy": 0.5274486541748047, | |
| "epoch": 5.35, | |
| "grad_norm": 2.915797233581543, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1627, | |
| "mean_token_accuracy": 0.9288889169692993, | |
| "num_tokens": 28039383.0, | |
| "step": 107 | |
| }, | |
| { | |
| "entropy": 0.5247786641120911, | |
| "epoch": 5.4, | |
| "grad_norm": 3.0220959186553955, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1685, | |
| "mean_token_accuracy": 0.9305993914604187, | |
| "num_tokens": 28301429.0, | |
| "step": 108 | |
| }, | |
| { | |
| "entropy": 0.5274587273597717, | |
| "epoch": 5.45, | |
| "grad_norm": 3.330185651779175, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1402, | |
| "mean_token_accuracy": 0.9378365278244019, | |
| "num_tokens": 28563471.0, | |
| "step": 109 | |
| }, | |
| { | |
| "entropy": 0.52198326587677, | |
| "epoch": 5.5, | |
| "grad_norm": 3.4707701206207275, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1497, | |
| "mean_token_accuracy": 0.933920681476593, | |
| "num_tokens": 28825554.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 0.5196748971939087, | |
| "epoch": 5.55, | |
| "grad_norm": 3.8048267364501953, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1682, | |
| "mean_token_accuracy": 0.9194383025169373, | |
| "num_tokens": 29087632.0, | |
| "step": 111 | |
| }, | |
| { | |
| "entropy": 0.5231800079345703, | |
| "epoch": 5.6, | |
| "grad_norm": 2.951167106628418, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1463, | |
| "mean_token_accuracy": 0.9328449368476868, | |
| "num_tokens": 29349665.0, | |
| "step": 112 | |
| }, | |
| { | |
| "entropy": 0.5201854705810547, | |
| "epoch": 5.65, | |
| "grad_norm": 3.4071881771087646, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1438, | |
| "mean_token_accuracy": 0.9292557239532471, | |
| "num_tokens": 29611714.0, | |
| "step": 113 | |
| }, | |
| { | |
| "entropy": 0.5249330997467041, | |
| "epoch": 5.7, | |
| "grad_norm": 2.2466695308685303, | |
| "learning_rate": 1e-06, | |
| "loss": 0.158, | |
| "mean_token_accuracy": 0.93149334192276, | |
| "num_tokens": 29873775.0, | |
| "step": 114 | |
| }, | |
| { | |
| "entropy": 0.5218572616577148, | |
| "epoch": 5.75, | |
| "grad_norm": 2.9838244915008545, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1371, | |
| "mean_token_accuracy": 0.9416499137878418, | |
| "num_tokens": 30135859.0, | |
| "step": 115 | |
| }, | |
| { | |
| "entropy": 0.5222463607788086, | |
| "epoch": 5.8, | |
| "grad_norm": 3.629559278488159, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1824, | |
| "mean_token_accuracy": 0.9253350496292114, | |
| "num_tokens": 30397920.0, | |
| "step": 116 | |
| }, | |
| { | |
| "entropy": 0.5209097862243652, | |
| "epoch": 5.85, | |
| "grad_norm": 5.2597174644470215, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1736, | |
| "mean_token_accuracy": 0.9180887341499329, | |
| "num_tokens": 30659978.0, | |
| "step": 117 | |
| }, | |
| { | |
| "entropy": 0.5217398405075073, | |
| "epoch": 5.9, | |
| "grad_norm": 2.7834465503692627, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1539, | |
| "mean_token_accuracy": 0.9320943355560303, | |
| "num_tokens": 30922069.0, | |
| "step": 118 | |
| }, | |
| { | |
| "entropy": 0.5216600894927979, | |
| "epoch": 5.95, | |
| "grad_norm": 3.517230749130249, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1599, | |
| "mean_token_accuracy": 0.9316734075546265, | |
| "num_tokens": 31184125.0, | |
| "step": 119 | |
| }, | |
| { | |
| "entropy": 0.5233047604560852, | |
| "epoch": 6.0, | |
| "grad_norm": 4.519037246704102, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1686, | |
| "mean_token_accuracy": 0.9267241358757019, | |
| "num_tokens": 31446144.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_entropy": 0.5248987674713135, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.8958333134651184, | |
| "eval_num_tokens": 31446144.0, | |
| "eval_runtime": 0.5646, | |
| "eval_samples_per_second": 442.826, | |
| "eval_steps_per_second": 1.771, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 0.5230313539505005, | |
| "epoch": 6.05, | |
| "grad_norm": 3.3974978923797607, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1382, | |
| "mean_token_accuracy": 0.9349269866943359, | |
| "num_tokens": 31708177.0, | |
| "step": 121 | |
| }, | |
| { | |
| "entropy": 0.5233445167541504, | |
| "epoch": 6.1, | |
| "grad_norm": 3.316178321838379, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1417, | |
| "mean_token_accuracy": 0.9336429834365845, | |
| "num_tokens": 31970211.0, | |
| "step": 122 | |
| }, | |
| { | |
| "entropy": 0.519654393196106, | |
| "epoch": 6.15, | |
| "grad_norm": 4.040668487548828, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1715, | |
| "mean_token_accuracy": 0.9241849780082703, | |
| "num_tokens": 32232255.0, | |
| "step": 123 | |
| }, | |
| { | |
| "entropy": 0.5215832591056824, | |
| "epoch": 6.2, | |
| "grad_norm": 4.117729663848877, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1637, | |
| "mean_token_accuracy": 0.9260615110397339, | |
| "num_tokens": 32494346.0, | |
| "step": 124 | |
| }, | |
| { | |
| "entropy": 0.5216290950775146, | |
| "epoch": 6.25, | |
| "grad_norm": 3.0051236152648926, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1519, | |
| "mean_token_accuracy": 0.9318435788154602, | |
| "num_tokens": 32756399.0, | |
| "step": 125 | |
| }, | |
| { | |
| "entropy": 0.5209139585494995, | |
| "epoch": 6.3, | |
| "grad_norm": 2.9851608276367188, | |
| "learning_rate": 1e-06, | |
| "loss": 0.133, | |
| "mean_token_accuracy": 0.9429529905319214, | |
| "num_tokens": 33018449.0, | |
| "step": 126 | |
| }, | |
| { | |
| "entropy": 0.5213064551353455, | |
| "epoch": 6.35, | |
| "grad_norm": 3.356963872909546, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1629, | |
| "mean_token_accuracy": 0.9313392043113708, | |
| "num_tokens": 33280513.0, | |
| "step": 127 | |
| }, | |
| { | |
| "entropy": 0.5207207202911377, | |
| "epoch": 6.4, | |
| "grad_norm": 3.217970132827759, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1518, | |
| "mean_token_accuracy": 0.9335684776306152, | |
| "num_tokens": 33542576.0, | |
| "step": 128 | |
| }, | |
| { | |
| "entropy": 0.5207620859146118, | |
| "epoch": 6.45, | |
| "grad_norm": 4.5813703536987305, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1713, | |
| "mean_token_accuracy": 0.9296131730079651, | |
| "num_tokens": 33804614.0, | |
| "step": 129 | |
| }, | |
| { | |
| "entropy": 0.5200029611587524, | |
| "epoch": 6.5, | |
| "grad_norm": 2.659916400909424, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1566, | |
| "mean_token_accuracy": 0.9301252365112305, | |
| "num_tokens": 34066691.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 0.5188630819320679, | |
| "epoch": 6.55, | |
| "grad_norm": 3.103395700454712, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1655, | |
| "mean_token_accuracy": 0.9257642030715942, | |
| "num_tokens": 34328757.0, | |
| "step": 131 | |
| }, | |
| { | |
| "entropy": 0.5178630948066711, | |
| "epoch": 6.6, | |
| "grad_norm": 3.037834644317627, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1467, | |
| "mean_token_accuracy": 0.9354194402694702, | |
| "num_tokens": 34590819.0, | |
| "step": 132 | |
| }, | |
| { | |
| "entropy": 0.5198030471801758, | |
| "epoch": 6.65, | |
| "grad_norm": 2.739222526550293, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1211, | |
| "mean_token_accuracy": 0.9462962746620178, | |
| "num_tokens": 34852867.0, | |
| "step": 133 | |
| }, | |
| { | |
| "entropy": 0.5187867879867554, | |
| "epoch": 6.7, | |
| "grad_norm": 3.5631425380706787, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1459, | |
| "mean_token_accuracy": 0.9339437484741211, | |
| "num_tokens": 35114923.0, | |
| "step": 134 | |
| }, | |
| { | |
| "entropy": 0.5223994851112366, | |
| "epoch": 6.75, | |
| "grad_norm": 3.349653482437134, | |
| "learning_rate": 1e-06, | |
| "loss": 0.15, | |
| "mean_token_accuracy": 0.9327133297920227, | |
| "num_tokens": 35376948.0, | |
| "step": 135 | |
| }, | |
| { | |
| "entropy": 0.5207353830337524, | |
| "epoch": 6.8, | |
| "grad_norm": 3.7862677574157715, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1588, | |
| "mean_token_accuracy": 0.9323570728302002, | |
| "num_tokens": 35639008.0, | |
| "step": 136 | |
| }, | |
| { | |
| "entropy": 0.522723376750946, | |
| "epoch": 6.85, | |
| "grad_norm": 3.227595090866089, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1556, | |
| "mean_token_accuracy": 0.9338235259056091, | |
| "num_tokens": 35901080.0, | |
| "step": 137 | |
| }, | |
| { | |
| "entropy": 0.5194015502929688, | |
| "epoch": 6.9, | |
| "grad_norm": 3.0805652141571045, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1246, | |
| "mean_token_accuracy": 0.9439759254455566, | |
| "num_tokens": 36163102.0, | |
| "step": 138 | |
| }, | |
| { | |
| "entropy": 0.5224202871322632, | |
| "epoch": 6.95, | |
| "grad_norm": 2.6702420711517334, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1331, | |
| "mean_token_accuracy": 0.942307710647583, | |
| "num_tokens": 36425129.0, | |
| "step": 139 | |
| }, | |
| { | |
| "entropy": 0.5183343887329102, | |
| "epoch": 7.0, | |
| "grad_norm": 3.2100484371185303, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1427, | |
| "mean_token_accuracy": 0.9357267022132874, | |
| "num_tokens": 36687152.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_entropy": 0.5203882455825806, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9045698642730713, | |
| "eval_num_tokens": 36687152.0, | |
| "eval_runtime": 0.5662, | |
| "eval_samples_per_second": 441.545, | |
| "eval_steps_per_second": 1.766, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 0.5172215104103088, | |
| "epoch": 7.05, | |
| "grad_norm": 2.8229575157165527, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1359, | |
| "mean_token_accuracy": 0.9341161251068115, | |
| "num_tokens": 36949179.0, | |
| "step": 141 | |
| }, | |
| { | |
| "entropy": 0.5188791751861572, | |
| "epoch": 7.1, | |
| "grad_norm": 3.300265312194824, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1479, | |
| "mean_token_accuracy": 0.9282218813896179, | |
| "num_tokens": 37211215.0, | |
| "step": 142 | |
| }, | |
| { | |
| "entropy": 0.5181975364685059, | |
| "epoch": 7.15, | |
| "grad_norm": 3.0605766773223877, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1311, | |
| "mean_token_accuracy": 0.9403209686279297, | |
| "num_tokens": 37473208.0, | |
| "step": 143 | |
| }, | |
| { | |
| "entropy": 0.5172451734542847, | |
| "epoch": 7.2, | |
| "grad_norm": 3.0566470623016357, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1371, | |
| "mean_token_accuracy": 0.9368836283683777, | |
| "num_tokens": 37735223.0, | |
| "step": 144 | |
| }, | |
| { | |
| "entropy": 0.5153446197509766, | |
| "epoch": 7.25, | |
| "grad_norm": 3.771998167037964, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1531, | |
| "mean_token_accuracy": 0.9337349534034729, | |
| "num_tokens": 37997309.0, | |
| "step": 145 | |
| }, | |
| { | |
| "entropy": 0.5160014033317566, | |
| "epoch": 7.3, | |
| "grad_norm": 3.9155826568603516, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1457, | |
| "mean_token_accuracy": 0.9334239363670349, | |
| "num_tokens": 38259341.0, | |
| "step": 146 | |
| }, | |
| { | |
| "entropy": 0.5150690078735352, | |
| "epoch": 7.35, | |
| "grad_norm": 3.842313766479492, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1434, | |
| "mean_token_accuracy": 0.9330238699913025, | |
| "num_tokens": 38521409.0, | |
| "step": 147 | |
| }, | |
| { | |
| "entropy": 0.5158810615539551, | |
| "epoch": 7.4, | |
| "grad_norm": 3.2817740440368652, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1542, | |
| "mean_token_accuracy": 0.9296690225601196, | |
| "num_tokens": 38783430.0, | |
| "step": 148 | |
| }, | |
| { | |
| "entropy": 0.5140302777290344, | |
| "epoch": 7.45, | |
| "grad_norm": 3.40156626701355, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1385, | |
| "mean_token_accuracy": 0.9309021234512329, | |
| "num_tokens": 39045487.0, | |
| "step": 149 | |
| }, | |
| { | |
| "entropy": 0.5199052691459656, | |
| "epoch": 7.5, | |
| "grad_norm": 3.458606004714966, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1402, | |
| "mean_token_accuracy": 0.9361202716827393, | |
| "num_tokens": 39307549.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 0.5160842537879944, | |
| "epoch": 7.55, | |
| "grad_norm": 2.932157278060913, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1373, | |
| "mean_token_accuracy": 0.9383945465087891, | |
| "num_tokens": 39569617.0, | |
| "step": 151 | |
| }, | |
| { | |
| "entropy": 0.5171875953674316, | |
| "epoch": 7.6, | |
| "grad_norm": 3.465000867843628, | |
| "learning_rate": 1e-06, | |
| "loss": 0.174, | |
| "mean_token_accuracy": 0.9331210255622864, | |
| "num_tokens": 39831648.0, | |
| "step": 152 | |
| }, | |
| { | |
| "entropy": 0.5146853923797607, | |
| "epoch": 7.65, | |
| "grad_norm": 5.0309343338012695, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1515, | |
| "mean_token_accuracy": 0.9341809749603271, | |
| "num_tokens": 40093743.0, | |
| "step": 153 | |
| }, | |
| { | |
| "entropy": 0.5160114765167236, | |
| "epoch": 7.7, | |
| "grad_norm": 4.118295192718506, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1289, | |
| "mean_token_accuracy": 0.9425212144851685, | |
| "num_tokens": 40355801.0, | |
| "step": 154 | |
| }, | |
| { | |
| "entropy": 0.5127236843109131, | |
| "epoch": 7.75, | |
| "grad_norm": 3.2528462409973145, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1355, | |
| "mean_token_accuracy": 0.94050532579422, | |
| "num_tokens": 40617885.0, | |
| "step": 155 | |
| }, | |
| { | |
| "entropy": 0.5161481499671936, | |
| "epoch": 7.8, | |
| "grad_norm": 3.1190099716186523, | |
| "learning_rate": 1e-06, | |
| "loss": 0.16, | |
| "mean_token_accuracy": 0.9346323013305664, | |
| "num_tokens": 40879937.0, | |
| "step": 156 | |
| }, | |
| { | |
| "entropy": 0.5165537595748901, | |
| "epoch": 7.85, | |
| "grad_norm": 2.945587635040283, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1374, | |
| "mean_token_accuracy": 0.9363411664962769, | |
| "num_tokens": 41141983.0, | |
| "step": 157 | |
| }, | |
| { | |
| "entropy": 0.5178842544555664, | |
| "epoch": 7.9, | |
| "grad_norm": 2.951826572418213, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1364, | |
| "mean_token_accuracy": 0.9384886026382446, | |
| "num_tokens": 41404052.0, | |
| "step": 158 | |
| }, | |
| { | |
| "entropy": 0.5159619450569153, | |
| "epoch": 7.95, | |
| "grad_norm": 4.019174575805664, | |
| "learning_rate": 1e-06, | |
| "loss": 0.132, | |
| "mean_token_accuracy": 0.9305019378662109, | |
| "num_tokens": 41666100.0, | |
| "step": 159 | |
| }, | |
| { | |
| "entropy": 0.5143953561782837, | |
| "epoch": 8.0, | |
| "grad_norm": 4.0759196281433105, | |
| "learning_rate": 1e-06, | |
| "loss": 0.143, | |
| "mean_token_accuracy": 0.9287616610527039, | |
| "num_tokens": 41928162.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_entropy": 0.5199548602104187, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.8958333134651184, | |
| "eval_num_tokens": 41928162.0, | |
| "eval_runtime": 0.5663, | |
| "eval_samples_per_second": 441.44, | |
| "eval_steps_per_second": 1.766, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 0.5157938003540039, | |
| "epoch": 8.05, | |
| "grad_norm": 3.2986180782318115, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1428, | |
| "mean_token_accuracy": 0.9472459554672241, | |
| "num_tokens": 42190242.0, | |
| "step": 161 | |
| }, | |
| { | |
| "entropy": 0.5174839496612549, | |
| "epoch": 8.1, | |
| "grad_norm": 2.7032060623168945, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1349, | |
| "mean_token_accuracy": 0.9354838728904724, | |
| "num_tokens": 42452270.0, | |
| "step": 162 | |
| }, | |
| { | |
| "entropy": 0.5150723457336426, | |
| "epoch": 8.15, | |
| "grad_norm": 3.3034956455230713, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1562, | |
| "mean_token_accuracy": 0.9342105388641357, | |
| "num_tokens": 42714340.0, | |
| "step": 163 | |
| }, | |
| { | |
| "entropy": 0.5165647268295288, | |
| "epoch": 8.2, | |
| "grad_norm": 3.147430181503296, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1267, | |
| "mean_token_accuracy": 0.9487179517745972, | |
| "num_tokens": 42976388.0, | |
| "step": 164 | |
| }, | |
| { | |
| "entropy": 0.5171810388565063, | |
| "epoch": 8.25, | |
| "grad_norm": 2.788745164871216, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1383, | |
| "mean_token_accuracy": 0.937644362449646, | |
| "num_tokens": 43238449.0, | |
| "step": 165 | |
| }, | |
| { | |
| "entropy": 0.5187046527862549, | |
| "epoch": 8.3, | |
| "grad_norm": 3.536580801010132, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1401, | |
| "mean_token_accuracy": 0.9310526251792908, | |
| "num_tokens": 43500447.0, | |
| "step": 166 | |
| }, | |
| { | |
| "entropy": 0.5151098370552063, | |
| "epoch": 8.35, | |
| "grad_norm": 3.484966516494751, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1584, | |
| "mean_token_accuracy": 0.9285714030265808, | |
| "num_tokens": 43762496.0, | |
| "step": 167 | |
| }, | |
| { | |
| "entropy": 0.5161045789718628, | |
| "epoch": 8.4, | |
| "grad_norm": 2.554356813430786, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1444, | |
| "mean_token_accuracy": 0.9319999814033508, | |
| "num_tokens": 44024542.0, | |
| "step": 168 | |
| }, | |
| { | |
| "entropy": 0.5149120688438416, | |
| "epoch": 8.45, | |
| "grad_norm": 4.06463623046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1288, | |
| "mean_token_accuracy": 0.9389110207557678, | |
| "num_tokens": 44286608.0, | |
| "step": 169 | |
| }, | |
| { | |
| "entropy": 0.5138819813728333, | |
| "epoch": 8.5, | |
| "grad_norm": 4.850083827972412, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1328, | |
| "mean_token_accuracy": 0.9411404132843018, | |
| "num_tokens": 44548667.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 0.513306736946106, | |
| "epoch": 8.55, | |
| "grad_norm": 2.4267070293426514, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1296, | |
| "mean_token_accuracy": 0.9418439865112305, | |
| "num_tokens": 44810703.0, | |
| "step": 171 | |
| }, | |
| { | |
| "entropy": 0.5128031969070435, | |
| "epoch": 8.6, | |
| "grad_norm": 3.5913071632385254, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1225, | |
| "mean_token_accuracy": 0.9422430992126465, | |
| "num_tokens": 45072719.0, | |
| "step": 172 | |
| }, | |
| { | |
| "entropy": 0.5119505524635315, | |
| "epoch": 8.65, | |
| "grad_norm": 3.707689046859741, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1477, | |
| "mean_token_accuracy": 0.9355555772781372, | |
| "num_tokens": 45334794.0, | |
| "step": 173 | |
| }, | |
| { | |
| "entropy": 0.5131097435951233, | |
| "epoch": 8.7, | |
| "grad_norm": 4.792629241943359, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1563, | |
| "mean_token_accuracy": 0.9289617538452148, | |
| "num_tokens": 45596869.0, | |
| "step": 174 | |
| }, | |
| { | |
| "entropy": 0.51198810338974, | |
| "epoch": 8.75, | |
| "grad_norm": 2.6373438835144043, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1328, | |
| "mean_token_accuracy": 0.9399141669273376, | |
| "num_tokens": 45858928.0, | |
| "step": 175 | |
| }, | |
| { | |
| "entropy": 0.5113101005554199, | |
| "epoch": 8.8, | |
| "grad_norm": 2.828310966491699, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1292, | |
| "mean_token_accuracy": 0.944847583770752, | |
| "num_tokens": 46120998.0, | |
| "step": 176 | |
| }, | |
| { | |
| "entropy": 0.514806866645813, | |
| "epoch": 8.85, | |
| "grad_norm": 3.7976365089416504, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1378, | |
| "mean_token_accuracy": 0.9391401410102844, | |
| "num_tokens": 46383019.0, | |
| "step": 177 | |
| }, | |
| { | |
| "entropy": 0.5154971480369568, | |
| "epoch": 8.9, | |
| "grad_norm": 3.059340000152588, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1317, | |
| "mean_token_accuracy": 0.9379541873931885, | |
| "num_tokens": 46645071.0, | |
| "step": 178 | |
| }, | |
| { | |
| "entropy": 0.5132753849029541, | |
| "epoch": 8.95, | |
| "grad_norm": 2.7030842304229736, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1241, | |
| "mean_token_accuracy": 0.9378563165664673, | |
| "num_tokens": 46907121.0, | |
| "step": 179 | |
| }, | |
| { | |
| "entropy": 0.5132700204849243, | |
| "epoch": 9.0, | |
| "grad_norm": 3.4913828372955322, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1243, | |
| "mean_token_accuracy": 0.9455108642578125, | |
| "num_tokens": 47169197.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_entropy": 0.5133532285690308, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9072580933570862, | |
| "eval_num_tokens": 47169197.0, | |
| "eval_runtime": 0.5634, | |
| "eval_samples_per_second": 443.762, | |
| "eval_steps_per_second": 1.775, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 0.5132405757904053, | |
| "epoch": 9.05, | |
| "grad_norm": 2.907648801803589, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1295, | |
| "mean_token_accuracy": 0.9371069073677063, | |
| "num_tokens": 47431209.0, | |
| "step": 181 | |
| }, | |
| { | |
| "entropy": 0.5094999670982361, | |
| "epoch": 9.1, | |
| "grad_norm": 3.242464303970337, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1316, | |
| "mean_token_accuracy": 0.9456915259361267, | |
| "num_tokens": 47693282.0, | |
| "step": 182 | |
| }, | |
| { | |
| "entropy": 0.5093837380409241, | |
| "epoch": 9.15, | |
| "grad_norm": 3.4022397994995117, | |
| "learning_rate": 1e-06, | |
| "loss": 0.119, | |
| "mean_token_accuracy": 0.9439567923545837, | |
| "num_tokens": 47955290.0, | |
| "step": 183 | |
| }, | |
| { | |
| "entropy": 0.5123763680458069, | |
| "epoch": 9.2, | |
| "grad_norm": 3.2485334873199463, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1219, | |
| "mean_token_accuracy": 0.9474899172782898, | |
| "num_tokens": 48217352.0, | |
| "step": 184 | |
| }, | |
| { | |
| "entropy": 0.5091462135314941, | |
| "epoch": 9.25, | |
| "grad_norm": 2.531839370727539, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1277, | |
| "mean_token_accuracy": 0.9405654668807983, | |
| "num_tokens": 48479414.0, | |
| "step": 185 | |
| }, | |
| { | |
| "entropy": 0.5079025030136108, | |
| "epoch": 9.3, | |
| "grad_norm": 4.208319187164307, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1655, | |
| "mean_token_accuracy": 0.9309878349304199, | |
| "num_tokens": 48741485.0, | |
| "step": 186 | |
| }, | |
| { | |
| "entropy": 0.5062220096588135, | |
| "epoch": 9.35, | |
| "grad_norm": 4.336572647094727, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1605, | |
| "mean_token_accuracy": 0.9226190447807312, | |
| "num_tokens": 49003521.0, | |
| "step": 187 | |
| }, | |
| { | |
| "entropy": 0.5075182914733887, | |
| "epoch": 9.4, | |
| "grad_norm": 3.8903305530548096, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1279, | |
| "mean_token_accuracy": 0.9393737316131592, | |
| "num_tokens": 49265549.0, | |
| "step": 188 | |
| }, | |
| { | |
| "entropy": 0.5041744709014893, | |
| "epoch": 9.45, | |
| "grad_norm": 4.592701435089111, | |
| "learning_rate": 1e-06, | |
| "loss": 0.146, | |
| "mean_token_accuracy": 0.9337060451507568, | |
| "num_tokens": 49527625.0, | |
| "step": 189 | |
| }, | |
| { | |
| "entropy": 0.5066587924957275, | |
| "epoch": 9.5, | |
| "grad_norm": 4.691225528717041, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1354, | |
| "mean_token_accuracy": 0.9368420839309692, | |
| "num_tokens": 49789713.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 0.5104098320007324, | |
| "epoch": 9.55, | |
| "grad_norm": 2.6505699157714844, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1374, | |
| "mean_token_accuracy": 0.942954957485199, | |
| "num_tokens": 50051762.0, | |
| "step": 191 | |
| }, | |
| { | |
| "entropy": 0.5087345242500305, | |
| "epoch": 9.6, | |
| "grad_norm": 3.0128960609436035, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1244, | |
| "mean_token_accuracy": 0.9477911591529846, | |
| "num_tokens": 50313783.0, | |
| "step": 192 | |
| }, | |
| { | |
| "entropy": 0.5104490518569946, | |
| "epoch": 9.65, | |
| "grad_norm": 2.859647035598755, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1296, | |
| "mean_token_accuracy": 0.9420821070671082, | |
| "num_tokens": 50575839.0, | |
| "step": 193 | |
| }, | |
| { | |
| "entropy": 0.5095815062522888, | |
| "epoch": 9.7, | |
| "grad_norm": 3.4269556999206543, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1245, | |
| "mean_token_accuracy": 0.9457672238349915, | |
| "num_tokens": 50837911.0, | |
| "step": 194 | |
| }, | |
| { | |
| "entropy": 0.5142146348953247, | |
| "epoch": 9.75, | |
| "grad_norm": 2.8217012882232666, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1231, | |
| "mean_token_accuracy": 0.9454896450042725, | |
| "num_tokens": 51099983.0, | |
| "step": 195 | |
| }, | |
| { | |
| "entropy": 0.5132467150688171, | |
| "epoch": 9.8, | |
| "grad_norm": 3.072129964828491, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1275, | |
| "mean_token_accuracy": 0.9424341917037964, | |
| "num_tokens": 51362037.0, | |
| "step": 196 | |
| }, | |
| { | |
| "entropy": 0.5132461786270142, | |
| "epoch": 9.85, | |
| "grad_norm": 4.272913932800293, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1399, | |
| "mean_token_accuracy": 0.9311075806617737, | |
| "num_tokens": 51624088.0, | |
| "step": 197 | |
| }, | |
| { | |
| "entropy": 0.5129303932189941, | |
| "epoch": 9.9, | |
| "grad_norm": 4.9169230461120605, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1576, | |
| "mean_token_accuracy": 0.9312573671340942, | |
| "num_tokens": 51886119.0, | |
| "step": 198 | |
| }, | |
| { | |
| "entropy": 0.5136593580245972, | |
| "epoch": 9.95, | |
| "grad_norm": 2.8221092224121094, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1408, | |
| "mean_token_accuracy": 0.9360189437866211, | |
| "num_tokens": 52148169.0, | |
| "step": 199 | |
| }, | |
| { | |
| "entropy": 0.513308048248291, | |
| "epoch": 10.0, | |
| "grad_norm": 2.4588990211486816, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1364, | |
| "mean_token_accuracy": 0.9390919208526611, | |
| "num_tokens": 52410205.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_entropy": 0.5121233463287354, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.8965053558349609, | |
| "eval_num_tokens": 52410205.0, | |
| "eval_runtime": 0.5632, | |
| "eval_samples_per_second": 443.897, | |
| "eval_steps_per_second": 1.776, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 0.5156605243682861, | |
| "epoch": 10.05, | |
| "grad_norm": 3.8599801063537598, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1296, | |
| "mean_token_accuracy": 0.9414870142936707, | |
| "num_tokens": 52672213.0, | |
| "step": 201 | |
| }, | |
| { | |
| "entropy": 0.5084142684936523, | |
| "epoch": 10.1, | |
| "grad_norm": 3.849475860595703, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1361, | |
| "mean_token_accuracy": 0.9329004287719727, | |
| "num_tokens": 52934291.0, | |
| "step": 202 | |
| }, | |
| { | |
| "entropy": 0.5113674998283386, | |
| "epoch": 10.15, | |
| "grad_norm": 3.281127691268921, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1253, | |
| "mean_token_accuracy": 0.9463624954223633, | |
| "num_tokens": 53196341.0, | |
| "step": 203 | |
| }, | |
| { | |
| "entropy": 0.5095717906951904, | |
| "epoch": 10.2, | |
| "grad_norm": 3.2623631954193115, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1342, | |
| "mean_token_accuracy": 0.9394292235374451, | |
| "num_tokens": 53458420.0, | |
| "step": 204 | |
| }, | |
| { | |
| "entropy": 0.5095763206481934, | |
| "epoch": 10.25, | |
| "grad_norm": 3.0780463218688965, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1136, | |
| "mean_token_accuracy": 0.9502018690109253, | |
| "num_tokens": 53720433.0, | |
| "step": 205 | |
| }, | |
| { | |
| "entropy": 0.5088077783584595, | |
| "epoch": 10.3, | |
| "grad_norm": 3.142488479614258, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1411, | |
| "mean_token_accuracy": 0.9367321729660034, | |
| "num_tokens": 53982489.0, | |
| "step": 206 | |
| }, | |
| { | |
| "entropy": 0.5087566375732422, | |
| "epoch": 10.35, | |
| "grad_norm": 3.4320948123931885, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1141, | |
| "mean_token_accuracy": 0.9442488551139832, | |
| "num_tokens": 54244555.0, | |
| "step": 207 | |
| }, | |
| { | |
| "entropy": 0.5068839192390442, | |
| "epoch": 10.4, | |
| "grad_norm": 4.642038345336914, | |
| "learning_rate": 1e-06, | |
| "loss": 0.168, | |
| "mean_token_accuracy": 0.9365397691726685, | |
| "num_tokens": 54506612.0, | |
| "step": 208 | |
| }, | |
| { | |
| "entropy": 0.5061399936676025, | |
| "epoch": 10.45, | |
| "grad_norm": 4.175653457641602, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1312, | |
| "mean_token_accuracy": 0.9389256834983826, | |
| "num_tokens": 54768696.0, | |
| "step": 209 | |
| }, | |
| { | |
| "entropy": 0.5083756446838379, | |
| "epoch": 10.5, | |
| "grad_norm": 3.5277068614959717, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1301, | |
| "mean_token_accuracy": 0.9352391958236694, | |
| "num_tokens": 55030739.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 0.505259096622467, | |
| "epoch": 10.55, | |
| "grad_norm": 4.416886806488037, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1353, | |
| "mean_token_accuracy": 0.9358885288238525, | |
| "num_tokens": 55292767.0, | |
| "step": 211 | |
| }, | |
| { | |
| "entropy": 0.5073595643043518, | |
| "epoch": 10.6, | |
| "grad_norm": 2.7743871212005615, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1397, | |
| "mean_token_accuracy": 0.9359895586967468, | |
| "num_tokens": 55554825.0, | |
| "step": 212 | |
| }, | |
| { | |
| "entropy": 0.5083353519439697, | |
| "epoch": 10.65, | |
| "grad_norm": 2.932196617126465, | |
| "learning_rate": 1e-06, | |
| "loss": 0.122, | |
| "mean_token_accuracy": 0.9455605745315552, | |
| "num_tokens": 55816862.0, | |
| "step": 213 | |
| }, | |
| { | |
| "entropy": 0.5083277821540833, | |
| "epoch": 10.7, | |
| "grad_norm": 3.528801441192627, | |
| "learning_rate": 1e-06, | |
| "loss": 0.13, | |
| "mean_token_accuracy": 0.9445459842681885, | |
| "num_tokens": 56078931.0, | |
| "step": 214 | |
| }, | |
| { | |
| "entropy": 0.5057054162025452, | |
| "epoch": 10.75, | |
| "grad_norm": 4.0908589363098145, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1257, | |
| "mean_token_accuracy": 0.9417199492454529, | |
| "num_tokens": 56340964.0, | |
| "step": 215 | |
| }, | |
| { | |
| "entropy": 0.5087761878967285, | |
| "epoch": 10.8, | |
| "grad_norm": 3.4696297645568848, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1151, | |
| "mean_token_accuracy": 0.9496581554412842, | |
| "num_tokens": 56603001.0, | |
| "step": 216 | |
| }, | |
| { | |
| "entropy": 0.5087063908576965, | |
| "epoch": 10.85, | |
| "grad_norm": 3.221892833709717, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1176, | |
| "mean_token_accuracy": 0.9435195922851562, | |
| "num_tokens": 56865045.0, | |
| "step": 217 | |
| }, | |
| { | |
| "entropy": 0.5074091553688049, | |
| "epoch": 10.9, | |
| "grad_norm": 4.037084102630615, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1468, | |
| "mean_token_accuracy": 0.9345403909683228, | |
| "num_tokens": 57127107.0, | |
| "step": 218 | |
| }, | |
| { | |
| "entropy": 0.510859489440918, | |
| "epoch": 10.95, | |
| "grad_norm": 3.951176166534424, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1251, | |
| "mean_token_accuracy": 0.9449082016944885, | |
| "num_tokens": 57389167.0, | |
| "step": 219 | |
| }, | |
| { | |
| "entropy": 0.5087305903434753, | |
| "epoch": 11.0, | |
| "grad_norm": 3.742441177368164, | |
| "learning_rate": 1e-06, | |
| "loss": 0.127, | |
| "mean_token_accuracy": 0.945555567741394, | |
| "num_tokens": 57651197.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_entropy": 0.5088062286376953, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9099462628364563, | |
| "eval_num_tokens": 57651197.0, | |
| "eval_runtime": 0.5668, | |
| "eval_samples_per_second": 441.055, | |
| "eval_steps_per_second": 1.764, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 0.5034325122833252, | |
| "epoch": 11.05, | |
| "grad_norm": 5.275214672088623, | |
| "learning_rate": 1e-06, | |
| "loss": 0.15, | |
| "mean_token_accuracy": 0.9286394119262695, | |
| "num_tokens": 57913303.0, | |
| "step": 221 | |
| }, | |
| { | |
| "entropy": 0.505387008190155, | |
| "epoch": 11.1, | |
| "grad_norm": 3.4117355346679688, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1238, | |
| "mean_token_accuracy": 0.9479674696922302, | |
| "num_tokens": 58175357.0, | |
| "step": 222 | |
| }, | |
| { | |
| "entropy": 0.5113849639892578, | |
| "epoch": 11.15, | |
| "grad_norm": 2.8327202796936035, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1258, | |
| "mean_token_accuracy": 0.9426156878471375, | |
| "num_tokens": 58437406.0, | |
| "step": 223 | |
| }, | |
| { | |
| "entropy": 0.5018768310546875, | |
| "epoch": 11.2, | |
| "grad_norm": 3.272310972213745, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1094, | |
| "mean_token_accuracy": 0.9515201449394226, | |
| "num_tokens": 58699480.0, | |
| "step": 224 | |
| }, | |
| { | |
| "entropy": 0.505340576171875, | |
| "epoch": 11.25, | |
| "grad_norm": 2.4740121364593506, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1209, | |
| "mean_token_accuracy": 0.9433842301368713, | |
| "num_tokens": 58961546.0, | |
| "step": 225 | |
| }, | |
| { | |
| "entropy": 0.506737232208252, | |
| "epoch": 11.3, | |
| "grad_norm": 3.198965311050415, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1236, | |
| "mean_token_accuracy": 0.9417647123336792, | |
| "num_tokens": 59223575.0, | |
| "step": 226 | |
| }, | |
| { | |
| "entropy": 0.5045244693756104, | |
| "epoch": 11.35, | |
| "grad_norm": 3.001002550125122, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1265, | |
| "mean_token_accuracy": 0.9461426734924316, | |
| "num_tokens": 59485641.0, | |
| "step": 227 | |
| }, | |
| { | |
| "entropy": 0.5068516135215759, | |
| "epoch": 11.4, | |
| "grad_norm": 3.9516587257385254, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1325, | |
| "mean_token_accuracy": 0.9444125890731812, | |
| "num_tokens": 59747649.0, | |
| "step": 228 | |
| }, | |
| { | |
| "entropy": 0.5061191916465759, | |
| "epoch": 11.45, | |
| "grad_norm": 3.9736275672912598, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1202, | |
| "mean_token_accuracy": 0.9477000832557678, | |
| "num_tokens": 60009697.0, | |
| "step": 229 | |
| }, | |
| { | |
| "entropy": 0.5050452947616577, | |
| "epoch": 11.5, | |
| "grad_norm": 3.388237714767456, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1113, | |
| "mean_token_accuracy": 0.9482221007347107, | |
| "num_tokens": 60271794.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 0.5037835836410522, | |
| "epoch": 11.55, | |
| "grad_norm": 4.176617622375488, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1532, | |
| "mean_token_accuracy": 0.933163583278656, | |
| "num_tokens": 60533859.0, | |
| "step": 231 | |
| }, | |
| { | |
| "entropy": 0.504481852054596, | |
| "epoch": 11.6, | |
| "grad_norm": 4.4760212898254395, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1466, | |
| "mean_token_accuracy": 0.9319999814033508, | |
| "num_tokens": 60795919.0, | |
| "step": 232 | |
| }, | |
| { | |
| "entropy": 0.5054460763931274, | |
| "epoch": 11.65, | |
| "grad_norm": 2.788715362548828, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1123, | |
| "mean_token_accuracy": 0.9475218653678894, | |
| "num_tokens": 61057963.0, | |
| "step": 233 | |
| }, | |
| { | |
| "entropy": 0.5077073574066162, | |
| "epoch": 11.7, | |
| "grad_norm": 2.838501214981079, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1288, | |
| "mean_token_accuracy": 0.9445010423660278, | |
| "num_tokens": 61319959.0, | |
| "step": 234 | |
| }, | |
| { | |
| "entropy": 0.5062661170959473, | |
| "epoch": 11.75, | |
| "grad_norm": 3.208291530609131, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1303, | |
| "mean_token_accuracy": 0.9417750239372253, | |
| "num_tokens": 61581991.0, | |
| "step": 235 | |
| }, | |
| { | |
| "entropy": 0.5048550963401794, | |
| "epoch": 11.8, | |
| "grad_norm": 3.915485382080078, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1204, | |
| "mean_token_accuracy": 0.9506539702415466, | |
| "num_tokens": 61843969.0, | |
| "step": 236 | |
| }, | |
| { | |
| "entropy": 0.5031319856643677, | |
| "epoch": 11.85, | |
| "grad_norm": 3.0714540481567383, | |
| "learning_rate": 1e-06, | |
| "loss": 0.116, | |
| "mean_token_accuracy": 0.9487970471382141, | |
| "num_tokens": 62106051.0, | |
| "step": 237 | |
| }, | |
| { | |
| "entropy": 0.5029826164245605, | |
| "epoch": 11.9, | |
| "grad_norm": 3.172436475753784, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1148, | |
| "mean_token_accuracy": 0.9473684430122375, | |
| "num_tokens": 62368113.0, | |
| "step": 238 | |
| }, | |
| { | |
| "entropy": 0.5026971101760864, | |
| "epoch": 11.95, | |
| "grad_norm": 3.787898540496826, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1188, | |
| "mean_token_accuracy": 0.9513981342315674, | |
| "num_tokens": 62630142.0, | |
| "step": 239 | |
| }, | |
| { | |
| "entropy": 0.501825213432312, | |
| "epoch": 12.0, | |
| "grad_norm": 3.851665735244751, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1269, | |
| "mean_token_accuracy": 0.9424046277999878, | |
| "num_tokens": 62892256.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_entropy": 0.5031265020370483, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9032257795333862, | |
| "eval_num_tokens": 62892256.0, | |
| "eval_runtime": 0.5654, | |
| "eval_samples_per_second": 442.144, | |
| "eval_steps_per_second": 1.769, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 0.5012459754943848, | |
| "epoch": 12.05, | |
| "grad_norm": 4.037276744842529, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1271, | |
| "mean_token_accuracy": 0.9416014552116394, | |
| "num_tokens": 63154312.0, | |
| "step": 241 | |
| }, | |
| { | |
| "entropy": 0.5014461278915405, | |
| "epoch": 12.1, | |
| "grad_norm": 2.862247943878174, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1123, | |
| "mean_token_accuracy": 0.9479434490203857, | |
| "num_tokens": 63416362.0, | |
| "step": 242 | |
| }, | |
| { | |
| "entropy": 0.5007213354110718, | |
| "epoch": 12.15, | |
| "grad_norm": 2.3149445056915283, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1107, | |
| "mean_token_accuracy": 0.9534450769424438, | |
| "num_tokens": 63678434.0, | |
| "step": 243 | |
| }, | |
| { | |
| "entropy": 0.5017549991607666, | |
| "epoch": 12.2, | |
| "grad_norm": 3.403278350830078, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1306, | |
| "mean_token_accuracy": 0.941082775592804, | |
| "num_tokens": 63940449.0, | |
| "step": 244 | |
| }, | |
| { | |
| "entropy": 0.4991673529148102, | |
| "epoch": 12.25, | |
| "grad_norm": 3.251974105834961, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1095, | |
| "mean_token_accuracy": 0.9473365545272827, | |
| "num_tokens": 64202496.0, | |
| "step": 245 | |
| }, | |
| { | |
| "entropy": 0.4982728958129883, | |
| "epoch": 12.3, | |
| "grad_norm": 3.218226909637451, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1216, | |
| "mean_token_accuracy": 0.9382879734039307, | |
| "num_tokens": 64464563.0, | |
| "step": 246 | |
| }, | |
| { | |
| "entropy": 0.4956706464290619, | |
| "epoch": 12.35, | |
| "grad_norm": 3.3675098419189453, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1157, | |
| "mean_token_accuracy": 0.9458943605422974, | |
| "num_tokens": 64726628.0, | |
| "step": 247 | |
| }, | |
| { | |
| "entropy": 0.4967312514781952, | |
| "epoch": 12.4, | |
| "grad_norm": 3.337940216064453, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1248, | |
| "mean_token_accuracy": 0.948885977268219, | |
| "num_tokens": 64988681.0, | |
| "step": 248 | |
| }, | |
| { | |
| "entropy": 0.4996580481529236, | |
| "epoch": 12.45, | |
| "grad_norm": 3.4728662967681885, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1167, | |
| "mean_token_accuracy": 0.9493753910064697, | |
| "num_tokens": 65250762.0, | |
| "step": 249 | |
| }, | |
| { | |
| "entropy": 0.49834275245666504, | |
| "epoch": 12.5, | |
| "grad_norm": 5.884078502655029, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1132, | |
| "mean_token_accuracy": 0.9457477927207947, | |
| "num_tokens": 65512785.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 0.5003111362457275, | |
| "epoch": 12.55, | |
| "grad_norm": 2.507913112640381, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1098, | |
| "mean_token_accuracy": 0.9487054347991943, | |
| "num_tokens": 65774831.0, | |
| "step": 251 | |
| }, | |
| { | |
| "entropy": 0.49748995900154114, | |
| "epoch": 12.6, | |
| "grad_norm": 3.47552490234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1201, | |
| "mean_token_accuracy": 0.9420111179351807, | |
| "num_tokens": 66036880.0, | |
| "step": 252 | |
| }, | |
| { | |
| "entropy": 0.4972376227378845, | |
| "epoch": 12.65, | |
| "grad_norm": 4.500434875488281, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1166, | |
| "mean_token_accuracy": 0.9459459185600281, | |
| "num_tokens": 66298966.0, | |
| "step": 253 | |
| }, | |
| { | |
| "entropy": 0.49823814630508423, | |
| "epoch": 12.7, | |
| "grad_norm": 4.090944290161133, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1162, | |
| "mean_token_accuracy": 0.943792462348938, | |
| "num_tokens": 66560980.0, | |
| "step": 254 | |
| }, | |
| { | |
| "entropy": 0.4966731369495392, | |
| "epoch": 12.75, | |
| "grad_norm": 4.648547649383545, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1449, | |
| "mean_token_accuracy": 0.937831699848175, | |
| "num_tokens": 66823057.0, | |
| "step": 255 | |
| }, | |
| { | |
| "entropy": 0.49896514415740967, | |
| "epoch": 12.8, | |
| "grad_norm": 3.447160482406616, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1325, | |
| "mean_token_accuracy": 0.939793586730957, | |
| "num_tokens": 67085064.0, | |
| "step": 256 | |
| }, | |
| { | |
| "entropy": 0.501239538192749, | |
| "epoch": 12.85, | |
| "grad_norm": 3.2995057106018066, | |
| "learning_rate": 1e-06, | |
| "loss": 0.133, | |
| "mean_token_accuracy": 0.9414660930633545, | |
| "num_tokens": 67347122.0, | |
| "step": 257 | |
| }, | |
| { | |
| "entropy": 0.4989623427391052, | |
| "epoch": 12.9, | |
| "grad_norm": 3.629384756088257, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1205, | |
| "mean_token_accuracy": 0.9440922141075134, | |
| "num_tokens": 67609153.0, | |
| "step": 258 | |
| }, | |
| { | |
| "entropy": 0.497197687625885, | |
| "epoch": 12.95, | |
| "grad_norm": 4.829705715179443, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1351, | |
| "mean_token_accuracy": 0.9379671216011047, | |
| "num_tokens": 67871216.0, | |
| "step": 259 | |
| }, | |
| { | |
| "entropy": 0.49780064821243286, | |
| "epoch": 13.0, | |
| "grad_norm": 4.333249092102051, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1023, | |
| "mean_token_accuracy": 0.9474367499351501, | |
| "num_tokens": 68133284.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_entropy": 0.503221333026886, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9052419066429138, | |
| "eval_num_tokens": 68133284.0, | |
| "eval_runtime": 0.561, | |
| "eval_samples_per_second": 445.601, | |
| "eval_steps_per_second": 1.782, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 0.5047539472579956, | |
| "epoch": 13.05, | |
| "grad_norm": 3.8571035861968994, | |
| "learning_rate": 1e-06, | |
| "loss": 0.119, | |
| "mean_token_accuracy": 0.9480260014533997, | |
| "num_tokens": 68395350.0, | |
| "step": 261 | |
| }, | |
| { | |
| "entropy": 0.5000075101852417, | |
| "epoch": 13.1, | |
| "grad_norm": 3.3609304428100586, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1286, | |
| "mean_token_accuracy": 0.9436339735984802, | |
| "num_tokens": 68657418.0, | |
| "step": 262 | |
| }, | |
| { | |
| "entropy": 0.5047616958618164, | |
| "epoch": 13.15, | |
| "grad_norm": 2.9678988456726074, | |
| "learning_rate": 1e-06, | |
| "loss": 0.103, | |
| "mean_token_accuracy": 0.9512548446655273, | |
| "num_tokens": 68919456.0, | |
| "step": 263 | |
| }, | |
| { | |
| "entropy": 0.4988555312156677, | |
| "epoch": 13.2, | |
| "grad_norm": 3.5749735832214355, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1156, | |
| "mean_token_accuracy": 0.948113203048706, | |
| "num_tokens": 69181519.0, | |
| "step": 264 | |
| }, | |
| { | |
| "entropy": 0.5020204782485962, | |
| "epoch": 13.25, | |
| "grad_norm": 3.25724196434021, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1086, | |
| "mean_token_accuracy": 0.9557783007621765, | |
| "num_tokens": 69443544.0, | |
| "step": 265 | |
| }, | |
| { | |
| "entropy": 0.500059187412262, | |
| "epoch": 13.3, | |
| "grad_norm": 3.753115177154541, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1237, | |
| "mean_token_accuracy": 0.9440914988517761, | |
| "num_tokens": 69705579.0, | |
| "step": 266 | |
| }, | |
| { | |
| "entropy": 0.5037001967430115, | |
| "epoch": 13.35, | |
| "grad_norm": 3.255347728729248, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1105, | |
| "mean_token_accuracy": 0.9521912336349487, | |
| "num_tokens": 69967586.0, | |
| "step": 267 | |
| }, | |
| { | |
| "entropy": 0.4993184208869934, | |
| "epoch": 13.4, | |
| "grad_norm": 3.5563864707946777, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1111, | |
| "mean_token_accuracy": 0.9501557350158691, | |
| "num_tokens": 70229652.0, | |
| "step": 268 | |
| }, | |
| { | |
| "entropy": 0.4983921945095062, | |
| "epoch": 13.45, | |
| "grad_norm": 3.5320169925689697, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1116, | |
| "mean_token_accuracy": 0.9436893463134766, | |
| "num_tokens": 70491691.0, | |
| "step": 269 | |
| }, | |
| { | |
| "entropy": 0.498714804649353, | |
| "epoch": 13.5, | |
| "grad_norm": 3.004915475845337, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1126, | |
| "mean_token_accuracy": 0.950441300868988, | |
| "num_tokens": 70753724.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 0.4960983693599701, | |
| "epoch": 13.55, | |
| "grad_norm": 4.270773887634277, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1157, | |
| "mean_token_accuracy": 0.9450740814208984, | |
| "num_tokens": 71015794.0, | |
| "step": 271 | |
| }, | |
| { | |
| "entropy": 0.4993487000465393, | |
| "epoch": 13.6, | |
| "grad_norm": 4.245420932769775, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1192, | |
| "mean_token_accuracy": 0.9475739002227783, | |
| "num_tokens": 71277883.0, | |
| "step": 272 | |
| }, | |
| { | |
| "entropy": 0.49819624423980713, | |
| "epoch": 13.65, | |
| "grad_norm": 4.052130222320557, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1226, | |
| "mean_token_accuracy": 0.9484173655509949, | |
| "num_tokens": 71539951.0, | |
| "step": 273 | |
| }, | |
| { | |
| "entropy": 0.4982229471206665, | |
| "epoch": 13.7, | |
| "grad_norm": 4.078166484832764, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1337, | |
| "mean_token_accuracy": 0.9399612545967102, | |
| "num_tokens": 71802027.0, | |
| "step": 274 | |
| }, | |
| { | |
| "entropy": 0.5001723766326904, | |
| "epoch": 13.75, | |
| "grad_norm": 3.4441871643066406, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1181, | |
| "mean_token_accuracy": 0.9488428831100464, | |
| "num_tokens": 72064064.0, | |
| "step": 275 | |
| }, | |
| { | |
| "entropy": 0.5003495812416077, | |
| "epoch": 13.8, | |
| "grad_norm": 4.0370097160339355, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1247, | |
| "mean_token_accuracy": 0.9390096664428711, | |
| "num_tokens": 72326115.0, | |
| "step": 276 | |
| }, | |
| { | |
| "entropy": 0.4975491166114807, | |
| "epoch": 13.85, | |
| "grad_norm": 3.9948337078094482, | |
| "learning_rate": 1e-06, | |
| "loss": 0.121, | |
| "mean_token_accuracy": 0.9438806176185608, | |
| "num_tokens": 72588152.0, | |
| "step": 277 | |
| }, | |
| { | |
| "entropy": 0.4980151653289795, | |
| "epoch": 13.9, | |
| "grad_norm": 3.5774476528167725, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1213, | |
| "mean_token_accuracy": 0.9462665915489197, | |
| "num_tokens": 72850178.0, | |
| "step": 278 | |
| }, | |
| { | |
| "entropy": 0.4962918162345886, | |
| "epoch": 13.95, | |
| "grad_norm": 3.5639283657073975, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1243, | |
| "mean_token_accuracy": 0.9504778385162354, | |
| "num_tokens": 73112252.0, | |
| "step": 279 | |
| }, | |
| { | |
| "entropy": 0.49853619933128357, | |
| "epoch": 14.0, | |
| "grad_norm": 3.286870241165161, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1101, | |
| "mean_token_accuracy": 0.9513888955116272, | |
| "num_tokens": 73374318.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_entropy": 0.4983806610107422, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9052419066429138, | |
| "eval_num_tokens": 73374318.0, | |
| "eval_runtime": 0.5649, | |
| "eval_samples_per_second": 442.518, | |
| "eval_steps_per_second": 1.77, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 0.4951111078262329, | |
| "epoch": 14.05, | |
| "grad_norm": 3.988466739654541, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1101, | |
| "mean_token_accuracy": 0.9488795399665833, | |
| "num_tokens": 73636405.0, | |
| "step": 281 | |
| }, | |
| { | |
| "entropy": 0.498315691947937, | |
| "epoch": 14.1, | |
| "grad_norm": 3.465620517730713, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1114, | |
| "mean_token_accuracy": 0.9480925798416138, | |
| "num_tokens": 73898432.0, | |
| "step": 282 | |
| }, | |
| { | |
| "entropy": 0.4973567724227905, | |
| "epoch": 14.15, | |
| "grad_norm": 3.7496891021728516, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1083, | |
| "mean_token_accuracy": 0.9504950642585754, | |
| "num_tokens": 74160476.0, | |
| "step": 283 | |
| }, | |
| { | |
| "entropy": 0.4969732165336609, | |
| "epoch": 14.2, | |
| "grad_norm": 3.5036423206329346, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1131, | |
| "mean_token_accuracy": 0.9461114406585693, | |
| "num_tokens": 74422504.0, | |
| "step": 284 | |
| }, | |
| { | |
| "entropy": 0.49568063020706177, | |
| "epoch": 14.25, | |
| "grad_norm": 3.9930689334869385, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1154, | |
| "mean_token_accuracy": 0.9531859755516052, | |
| "num_tokens": 74684569.0, | |
| "step": 285 | |
| }, | |
| { | |
| "entropy": 0.4962500035762787, | |
| "epoch": 14.3, | |
| "grad_norm": 2.8734872341156006, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1192, | |
| "mean_token_accuracy": 0.9498327970504761, | |
| "num_tokens": 74946593.0, | |
| "step": 286 | |
| }, | |
| { | |
| "entropy": 0.4961685836315155, | |
| "epoch": 14.35, | |
| "grad_norm": 3.3552212715148926, | |
| "learning_rate": 1e-06, | |
| "loss": 0.121, | |
| "mean_token_accuracy": 0.9423274993896484, | |
| "num_tokens": 75208633.0, | |
| "step": 287 | |
| }, | |
| { | |
| "entropy": 0.49256008863449097, | |
| "epoch": 14.4, | |
| "grad_norm": 3.5463008880615234, | |
| "learning_rate": 1e-06, | |
| "loss": 0.111, | |
| "mean_token_accuracy": 0.9447004795074463, | |
| "num_tokens": 75470693.0, | |
| "step": 288 | |
| }, | |
| { | |
| "entropy": 0.4943183660507202, | |
| "epoch": 14.45, | |
| "grad_norm": 3.921447277069092, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1058, | |
| "mean_token_accuracy": 0.953438401222229, | |
| "num_tokens": 75732748.0, | |
| "step": 289 | |
| }, | |
| { | |
| "entropy": 0.49364545941352844, | |
| "epoch": 14.5, | |
| "grad_norm": 3.0754876136779785, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1173, | |
| "mean_token_accuracy": 0.9483187794685364, | |
| "num_tokens": 75994815.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 0.4927959144115448, | |
| "epoch": 14.55, | |
| "grad_norm": 2.622016191482544, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1096, | |
| "mean_token_accuracy": 0.9544615149497986, | |
| "num_tokens": 76256835.0, | |
| "step": 291 | |
| }, | |
| { | |
| "entropy": 0.4929812550544739, | |
| "epoch": 14.6, | |
| "grad_norm": 4.265964508056641, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1088, | |
| "mean_token_accuracy": 0.953329861164093, | |
| "num_tokens": 76518873.0, | |
| "step": 292 | |
| }, | |
| { | |
| "entropy": 0.49077093601226807, | |
| "epoch": 14.65, | |
| "grad_norm": 4.118034839630127, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1189, | |
| "mean_token_accuracy": 0.9410150647163391, | |
| "num_tokens": 76780957.0, | |
| "step": 293 | |
| }, | |
| { | |
| "entropy": 0.4885583221912384, | |
| "epoch": 14.7, | |
| "grad_norm": 4.893588066101074, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0994, | |
| "mean_token_accuracy": 0.9585448503494263, | |
| "num_tokens": 77042996.0, | |
| "step": 294 | |
| }, | |
| { | |
| "entropy": 0.48888856172561646, | |
| "epoch": 14.75, | |
| "grad_norm": 4.3738789558410645, | |
| "learning_rate": 1e-06, | |
| "loss": 0.138, | |
| "mean_token_accuracy": 0.9407114386558533, | |
| "num_tokens": 77305052.0, | |
| "step": 295 | |
| }, | |
| { | |
| "entropy": 0.49154844880104065, | |
| "epoch": 14.8, | |
| "grad_norm": 6.126094341278076, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1172, | |
| "mean_token_accuracy": 0.954346776008606, | |
| "num_tokens": 77567110.0, | |
| "step": 296 | |
| }, | |
| { | |
| "entropy": 0.490234911441803, | |
| "epoch": 14.85, | |
| "grad_norm": 5.756350994110107, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1215, | |
| "mean_token_accuracy": 0.9456824660301208, | |
| "num_tokens": 77829205.0, | |
| "step": 297 | |
| }, | |
| { | |
| "entropy": 0.4910707175731659, | |
| "epoch": 14.9, | |
| "grad_norm": 3.7809011936187744, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1062, | |
| "mean_token_accuracy": 0.9476373195648193, | |
| "num_tokens": 78091232.0, | |
| "step": 298 | |
| }, | |
| { | |
| "entropy": 0.49150994420051575, | |
| "epoch": 14.95, | |
| "grad_norm": 3.2236623764038086, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1213, | |
| "mean_token_accuracy": 0.9479166865348816, | |
| "num_tokens": 78353286.0, | |
| "step": 299 | |
| }, | |
| { | |
| "entropy": 0.49145615100860596, | |
| "epoch": 15.0, | |
| "grad_norm": 2.271028757095337, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1012, | |
| "mean_token_accuracy": 0.9547767043113708, | |
| "num_tokens": 78615351.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_entropy": 0.4905474781990051, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9045698642730713, | |
| "eval_num_tokens": 78615351.0, | |
| "eval_runtime": 0.5672, | |
| "eval_samples_per_second": 440.741, | |
| "eval_steps_per_second": 1.763, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.4869128465652466, | |
| "epoch": 15.05, | |
| "grad_norm": 5.4611053466796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1218, | |
| "mean_token_accuracy": 0.940838098526001, | |
| "num_tokens": 78877392.0, | |
| "step": 301 | |
| }, | |
| { | |
| "entropy": 0.491014301776886, | |
| "epoch": 15.1, | |
| "grad_norm": 3.0112688541412354, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1045, | |
| "mean_token_accuracy": 0.9524050354957581, | |
| "num_tokens": 79139432.0, | |
| "step": 302 | |
| }, | |
| { | |
| "entropy": 0.49170082807540894, | |
| "epoch": 15.15, | |
| "grad_norm": 4.067041873931885, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1254, | |
| "mean_token_accuracy": 0.9467312097549438, | |
| "num_tokens": 79401479.0, | |
| "step": 303 | |
| }, | |
| { | |
| "entropy": 0.49128258228302, | |
| "epoch": 15.2, | |
| "grad_norm": 3.7372446060180664, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1062, | |
| "mean_token_accuracy": 0.952275276184082, | |
| "num_tokens": 79663544.0, | |
| "step": 304 | |
| }, | |
| { | |
| "entropy": 0.48871222138404846, | |
| "epoch": 15.25, | |
| "grad_norm": 3.4806947708129883, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1028, | |
| "mean_token_accuracy": 0.9505928754806519, | |
| "num_tokens": 79925622.0, | |
| "step": 305 | |
| }, | |
| { | |
| "entropy": 0.4901201128959656, | |
| "epoch": 15.3, | |
| "grad_norm": 3.2800400257110596, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1022, | |
| "mean_token_accuracy": 0.9538653492927551, | |
| "num_tokens": 80187687.0, | |
| "step": 306 | |
| }, | |
| { | |
| "entropy": 0.49247848987579346, | |
| "epoch": 15.35, | |
| "grad_norm": 2.735215663909912, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1091, | |
| "mean_token_accuracy": 0.9488189220428467, | |
| "num_tokens": 80449695.0, | |
| "step": 307 | |
| }, | |
| { | |
| "entropy": 0.4879264235496521, | |
| "epoch": 15.4, | |
| "grad_norm": 3.8763179779052734, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1018, | |
| "mean_token_accuracy": 0.9574912786483765, | |
| "num_tokens": 80711756.0, | |
| "step": 308 | |
| }, | |
| { | |
| "entropy": 0.489043265581131, | |
| "epoch": 15.45, | |
| "grad_norm": 3.1737430095672607, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1091, | |
| "mean_token_accuracy": 0.949999988079071, | |
| "num_tokens": 80973817.0, | |
| "step": 309 | |
| }, | |
| { | |
| "entropy": 0.4896194338798523, | |
| "epoch": 15.5, | |
| "grad_norm": 2.9024124145507812, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1076, | |
| "mean_token_accuracy": 0.9545205235481262, | |
| "num_tokens": 81235839.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 0.4866348206996918, | |
| "epoch": 15.55, | |
| "grad_norm": 2.981309175491333, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1096, | |
| "mean_token_accuracy": 0.9505454301834106, | |
| "num_tokens": 81497873.0, | |
| "step": 311 | |
| }, | |
| { | |
| "entropy": 0.4875825047492981, | |
| "epoch": 15.6, | |
| "grad_norm": 3.687138319015503, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1089, | |
| "mean_token_accuracy": 0.9505016803741455, | |
| "num_tokens": 81759928.0, | |
| "step": 312 | |
| }, | |
| { | |
| "entropy": 0.48804572224617004, | |
| "epoch": 15.65, | |
| "grad_norm": 3.807471752166748, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1089, | |
| "mean_token_accuracy": 0.9481101632118225, | |
| "num_tokens": 82021983.0, | |
| "step": 313 | |
| }, | |
| { | |
| "entropy": 0.48681819438934326, | |
| "epoch": 15.7, | |
| "grad_norm": 3.4905779361724854, | |
| "learning_rate": 1e-06, | |
| "loss": 0.099, | |
| "mean_token_accuracy": 0.9566075205802917, | |
| "num_tokens": 82284064.0, | |
| "step": 314 | |
| }, | |
| { | |
| "entropy": 0.48834747076034546, | |
| "epoch": 15.75, | |
| "grad_norm": 5.331181526184082, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1124, | |
| "mean_token_accuracy": 0.9493902325630188, | |
| "num_tokens": 82546132.0, | |
| "step": 315 | |
| }, | |
| { | |
| "entropy": 0.48611488938331604, | |
| "epoch": 15.8, | |
| "grad_norm": 3.41743803024292, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1213, | |
| "mean_token_accuracy": 0.9472049474716187, | |
| "num_tokens": 82808203.0, | |
| "step": 316 | |
| }, | |
| { | |
| "entropy": 0.4868428409099579, | |
| "epoch": 15.85, | |
| "grad_norm": 4.189897537231445, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1092, | |
| "mean_token_accuracy": 0.9522203207015991, | |
| "num_tokens": 83070245.0, | |
| "step": 317 | |
| }, | |
| { | |
| "entropy": 0.48746997117996216, | |
| "epoch": 15.9, | |
| "grad_norm": 4.698352813720703, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1084, | |
| "mean_token_accuracy": 0.9526795744895935, | |
| "num_tokens": 83332229.0, | |
| "step": 318 | |
| }, | |
| { | |
| "entropy": 0.4847297966480255, | |
| "epoch": 15.95, | |
| "grad_norm": 3.628556728363037, | |
| "learning_rate": 1e-06, | |
| "loss": 0.116, | |
| "mean_token_accuracy": 0.9458874464035034, | |
| "num_tokens": 83594307.0, | |
| "step": 319 | |
| }, | |
| { | |
| "entropy": 0.4873000979423523, | |
| "epoch": 16.0, | |
| "grad_norm": 3.9242656230926514, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1136, | |
| "mean_token_accuracy": 0.944888174533844, | |
| "num_tokens": 83856350.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_entropy": 0.489590585231781, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9065860509872437, | |
| "eval_num_tokens": 83856350.0, | |
| "eval_runtime": 0.564, | |
| "eval_samples_per_second": 443.283, | |
| "eval_steps_per_second": 1.773, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 0.4858003854751587, | |
| "epoch": 16.05, | |
| "grad_norm": 4.172854423522949, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1193, | |
| "mean_token_accuracy": 0.946601927280426, | |
| "num_tokens": 84118410.0, | |
| "step": 321 | |
| }, | |
| { | |
| "entropy": 0.48904526233673096, | |
| "epoch": 16.1, | |
| "grad_norm": 3.2139930725097656, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1026, | |
| "mean_token_accuracy": 0.9490445852279663, | |
| "num_tokens": 84380441.0, | |
| "step": 322 | |
| }, | |
| { | |
| "entropy": 0.48847872018814087, | |
| "epoch": 16.15, | |
| "grad_norm": 3.9387967586517334, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1079, | |
| "mean_token_accuracy": 0.9527458548545837, | |
| "num_tokens": 84642468.0, | |
| "step": 323 | |
| }, | |
| { | |
| "entropy": 0.48851415514945984, | |
| "epoch": 16.2, | |
| "grad_norm": 3.1942989826202393, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1178, | |
| "mean_token_accuracy": 0.9528598189353943, | |
| "num_tokens": 84904553.0, | |
| "step": 324 | |
| }, | |
| { | |
| "entropy": 0.4875437021255493, | |
| "epoch": 16.25, | |
| "grad_norm": 4.474672317504883, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0876, | |
| "mean_token_accuracy": 0.9609507918357849, | |
| "num_tokens": 85166583.0, | |
| "step": 325 | |
| }, | |
| { | |
| "entropy": 0.49070078134536743, | |
| "epoch": 16.3, | |
| "grad_norm": 3.77111554145813, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1109, | |
| "mean_token_accuracy": 0.9536523818969727, | |
| "num_tokens": 85428633.0, | |
| "step": 326 | |
| }, | |
| { | |
| "entropy": 0.4889669418334961, | |
| "epoch": 16.35, | |
| "grad_norm": 3.3292832374572754, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1066, | |
| "mean_token_accuracy": 0.9531335234642029, | |
| "num_tokens": 85690632.0, | |
| "step": 327 | |
| }, | |
| { | |
| "entropy": 0.48486199975013733, | |
| "epoch": 16.4, | |
| "grad_norm": 3.8034586906433105, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1161, | |
| "mean_token_accuracy": 0.9467918872833252, | |
| "num_tokens": 85952701.0, | |
| "step": 328 | |
| }, | |
| { | |
| "entropy": 0.4868103563785553, | |
| "epoch": 16.45, | |
| "grad_norm": 2.931748151779175, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0993, | |
| "mean_token_accuracy": 0.9569321274757385, | |
| "num_tokens": 86214758.0, | |
| "step": 329 | |
| }, | |
| { | |
| "entropy": 0.48670750856399536, | |
| "epoch": 16.5, | |
| "grad_norm": 4.134925842285156, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1098, | |
| "mean_token_accuracy": 0.9492447376251221, | |
| "num_tokens": 86476808.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 0.485757052898407, | |
| "epoch": 16.55, | |
| "grad_norm": 3.8004045486450195, | |
| "learning_rate": 1e-06, | |
| "loss": 0.106, | |
| "mean_token_accuracy": 0.9532483220100403, | |
| "num_tokens": 86738817.0, | |
| "step": 331 | |
| }, | |
| { | |
| "entropy": 0.4827927350997925, | |
| "epoch": 16.6, | |
| "grad_norm": 4.365555286407471, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0962, | |
| "mean_token_accuracy": 0.9560723304748535, | |
| "num_tokens": 87000859.0, | |
| "step": 332 | |
| }, | |
| { | |
| "entropy": 0.4798928499221802, | |
| "epoch": 16.65, | |
| "grad_norm": 4.611724376678467, | |
| "learning_rate": 1e-06, | |
| "loss": 0.108, | |
| "mean_token_accuracy": 0.9538087248802185, | |
| "num_tokens": 87262950.0, | |
| "step": 333 | |
| }, | |
| { | |
| "entropy": 0.48096251487731934, | |
| "epoch": 16.7, | |
| "grad_norm": 4.28861665725708, | |
| "learning_rate": 1e-06, | |
| "loss": 0.103, | |
| "mean_token_accuracy": 0.9518492817878723, | |
| "num_tokens": 87525009.0, | |
| "step": 334 | |
| }, | |
| { | |
| "entropy": 0.484794557094574, | |
| "epoch": 16.75, | |
| "grad_norm": 3.724881172180176, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1138, | |
| "mean_token_accuracy": 0.9499734044075012, | |
| "num_tokens": 87787052.0, | |
| "step": 335 | |
| }, | |
| { | |
| "entropy": 0.4819689393043518, | |
| "epoch": 16.8, | |
| "grad_norm": 5.316562652587891, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0998, | |
| "mean_token_accuracy": 0.9543702006340027, | |
| "num_tokens": 88049135.0, | |
| "step": 336 | |
| }, | |
| { | |
| "entropy": 0.4811255931854248, | |
| "epoch": 16.85, | |
| "grad_norm": 4.379755973815918, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1125, | |
| "mean_token_accuracy": 0.9438552856445312, | |
| "num_tokens": 88311199.0, | |
| "step": 337 | |
| }, | |
| { | |
| "entropy": 0.4819214940071106, | |
| "epoch": 16.9, | |
| "grad_norm": 3.4126381874084473, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1071, | |
| "mean_token_accuracy": 0.9480443596839905, | |
| "num_tokens": 88573241.0, | |
| "step": 338 | |
| }, | |
| { | |
| "entropy": 0.48113828897476196, | |
| "epoch": 16.95, | |
| "grad_norm": 4.438907146453857, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1032, | |
| "mean_token_accuracy": 0.953698456287384, | |
| "num_tokens": 88835308.0, | |
| "step": 339 | |
| }, | |
| { | |
| "entropy": 0.4829384684562683, | |
| "epoch": 17.0, | |
| "grad_norm": 4.242271423339844, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1189, | |
| "mean_token_accuracy": 0.949438214302063, | |
| "num_tokens": 89097378.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "eval_entropy": 0.48575037717819214, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9059139490127563, | |
| "eval_num_tokens": 89097378.0, | |
| "eval_runtime": 0.5659, | |
| "eval_samples_per_second": 441.736, | |
| "eval_steps_per_second": 1.767, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 0.4829714000225067, | |
| "epoch": 17.05, | |
| "grad_norm": 4.21494197845459, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1039, | |
| "mean_token_accuracy": 0.954402506351471, | |
| "num_tokens": 89359441.0, | |
| "step": 341 | |
| }, | |
| { | |
| "entropy": 0.4828266501426697, | |
| "epoch": 17.1, | |
| "grad_norm": 3.5206823348999023, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1119, | |
| "mean_token_accuracy": 0.9511111378669739, | |
| "num_tokens": 89621477.0, | |
| "step": 342 | |
| }, | |
| { | |
| "entropy": 0.48437392711639404, | |
| "epoch": 17.15, | |
| "grad_norm": 4.2214674949646, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1004, | |
| "mean_token_accuracy": 0.9563080072402954, | |
| "num_tokens": 89883571.0, | |
| "step": 343 | |
| }, | |
| { | |
| "entropy": 0.4833766222000122, | |
| "epoch": 17.2, | |
| "grad_norm": 4.171907901763916, | |
| "learning_rate": 1e-06, | |
| "loss": 0.105, | |
| "mean_token_accuracy": 0.953951895236969, | |
| "num_tokens": 90145619.0, | |
| "step": 344 | |
| }, | |
| { | |
| "entropy": 0.48499199748039246, | |
| "epoch": 17.25, | |
| "grad_norm": 3.7562005519866943, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1001, | |
| "mean_token_accuracy": 0.9547511339187622, | |
| "num_tokens": 90407683.0, | |
| "step": 345 | |
| }, | |
| { | |
| "entropy": 0.48458331823349, | |
| "epoch": 17.3, | |
| "grad_norm": 3.6610958576202393, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1032, | |
| "mean_token_accuracy": 0.9532163739204407, | |
| "num_tokens": 90669722.0, | |
| "step": 346 | |
| }, | |
| { | |
| "entropy": 0.4811995327472687, | |
| "epoch": 17.35, | |
| "grad_norm": 3.4695615768432617, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1127, | |
| "mean_token_accuracy": 0.9553333520889282, | |
| "num_tokens": 90931782.0, | |
| "step": 347 | |
| }, | |
| { | |
| "entropy": 0.4832395017147064, | |
| "epoch": 17.4, | |
| "grad_norm": 4.198061466217041, | |
| "learning_rate": 1e-06, | |
| "loss": 0.088, | |
| "mean_token_accuracy": 0.9628930687904358, | |
| "num_tokens": 91193833.0, | |
| "step": 348 | |
| }, | |
| { | |
| "entropy": 0.4821211099624634, | |
| "epoch": 17.45, | |
| "grad_norm": 3.404797315597534, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0964, | |
| "mean_token_accuracy": 0.9528061151504517, | |
| "num_tokens": 91455895.0, | |
| "step": 349 | |
| }, | |
| { | |
| "entropy": 0.479155957698822, | |
| "epoch": 17.5, | |
| "grad_norm": 5.393930912017822, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1076, | |
| "mean_token_accuracy": 0.9529499411582947, | |
| "num_tokens": 91717992.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 0.48262494802474976, | |
| "epoch": 17.55, | |
| "grad_norm": 3.950324535369873, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1016, | |
| "mean_token_accuracy": 0.9538905024528503, | |
| "num_tokens": 91979974.0, | |
| "step": 351 | |
| }, | |
| { | |
| "entropy": 0.4808007478713989, | |
| "epoch": 17.6, | |
| "grad_norm": 5.840694427490234, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0978, | |
| "mean_token_accuracy": 0.9548532962799072, | |
| "num_tokens": 92242028.0, | |
| "step": 352 | |
| }, | |
| { | |
| "entropy": 0.4793074131011963, | |
| "epoch": 17.65, | |
| "grad_norm": 4.341586112976074, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1026, | |
| "mean_token_accuracy": 0.9537906050682068, | |
| "num_tokens": 92504072.0, | |
| "step": 353 | |
| }, | |
| { | |
| "entropy": 0.48200857639312744, | |
| "epoch": 17.7, | |
| "grad_norm": 4.7615485191345215, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0989, | |
| "mean_token_accuracy": 0.9592834115028381, | |
| "num_tokens": 92766111.0, | |
| "step": 354 | |
| }, | |
| { | |
| "entropy": 0.4793251156806946, | |
| "epoch": 17.75, | |
| "grad_norm": 4.265474796295166, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0984, | |
| "mean_token_accuracy": 0.9533022046089172, | |
| "num_tokens": 93028137.0, | |
| "step": 355 | |
| }, | |
| { | |
| "entropy": 0.48376020789146423, | |
| "epoch": 17.8, | |
| "grad_norm": 4.087716579437256, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1104, | |
| "mean_token_accuracy": 0.9477089047431946, | |
| "num_tokens": 93290189.0, | |
| "step": 356 | |
| }, | |
| { | |
| "entropy": 0.48131391406059265, | |
| "epoch": 17.85, | |
| "grad_norm": 3.9392213821411133, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1016, | |
| "mean_token_accuracy": 0.9560810923576355, | |
| "num_tokens": 93552229.0, | |
| "step": 357 | |
| }, | |
| { | |
| "entropy": 0.4821656346321106, | |
| "epoch": 17.9, | |
| "grad_norm": 4.806204795837402, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1097, | |
| "mean_token_accuracy": 0.9533898234367371, | |
| "num_tokens": 93814276.0, | |
| "step": 358 | |
| }, | |
| { | |
| "entropy": 0.4840206801891327, | |
| "epoch": 17.95, | |
| "grad_norm": 4.974476337432861, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0998, | |
| "mean_token_accuracy": 0.9556295275688171, | |
| "num_tokens": 94076342.0, | |
| "step": 359 | |
| }, | |
| { | |
| "entropy": 0.48071110248565674, | |
| "epoch": 18.0, | |
| "grad_norm": 3.907980442047119, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1086, | |
| "mean_token_accuracy": 0.9533995389938354, | |
| "num_tokens": 94338409.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_entropy": 0.4855648875236511, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9072580933570862, | |
| "eval_num_tokens": 94338409.0, | |
| "eval_runtime": 0.5648, | |
| "eval_samples_per_second": 442.64, | |
| "eval_steps_per_second": 1.771, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 0.4835587739944458, | |
| "epoch": 18.05, | |
| "grad_norm": 3.5856211185455322, | |
| "learning_rate": 1e-06, | |
| "loss": 0.106, | |
| "mean_token_accuracy": 0.9543883800506592, | |
| "num_tokens": 94600449.0, | |
| "step": 361 | |
| }, | |
| { | |
| "entropy": 0.4841495454311371, | |
| "epoch": 18.1, | |
| "grad_norm": 3.932619094848633, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0988, | |
| "mean_token_accuracy": 0.9541892409324646, | |
| "num_tokens": 94862470.0, | |
| "step": 362 | |
| }, | |
| { | |
| "entropy": 0.48350268602371216, | |
| "epoch": 18.15, | |
| "grad_norm": 3.5396127700805664, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0872, | |
| "mean_token_accuracy": 0.9625223278999329, | |
| "num_tokens": 95124546.0, | |
| "step": 363 | |
| }, | |
| { | |
| "entropy": 0.48403599858283997, | |
| "epoch": 18.2, | |
| "grad_norm": 3.43064546585083, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0871, | |
| "mean_token_accuracy": 0.9590017795562744, | |
| "num_tokens": 95386591.0, | |
| "step": 364 | |
| }, | |
| { | |
| "entropy": 0.4856931269168854, | |
| "epoch": 18.25, | |
| "grad_norm": 3.188349485397339, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0961, | |
| "mean_token_accuracy": 0.9583789706230164, | |
| "num_tokens": 95648581.0, | |
| "step": 365 | |
| }, | |
| { | |
| "entropy": 0.48474887013435364, | |
| "epoch": 18.3, | |
| "grad_norm": 2.6797800064086914, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0945, | |
| "mean_token_accuracy": 0.9626865386962891, | |
| "num_tokens": 95910617.0, | |
| "step": 366 | |
| }, | |
| { | |
| "entropy": 0.48159462213516235, | |
| "epoch": 18.35, | |
| "grad_norm": 4.948982238769531, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1159, | |
| "mean_token_accuracy": 0.9509345889091492, | |
| "num_tokens": 96172692.0, | |
| "step": 367 | |
| }, | |
| { | |
| "entropy": 0.48261338472366333, | |
| "epoch": 18.4, | |
| "grad_norm": 4.678440093994141, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1114, | |
| "mean_token_accuracy": 0.9544126391410828, | |
| "num_tokens": 96434732.0, | |
| "step": 368 | |
| }, | |
| { | |
| "entropy": 0.47885391116142273, | |
| "epoch": 18.45, | |
| "grad_norm": 6.533933639526367, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0894, | |
| "mean_token_accuracy": 0.9582701325416565, | |
| "num_tokens": 96696808.0, | |
| "step": 369 | |
| }, | |
| { | |
| "entropy": 0.47860729694366455, | |
| "epoch": 18.5, | |
| "grad_norm": 4.395998001098633, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1031, | |
| "mean_token_accuracy": 0.9530686140060425, | |
| "num_tokens": 96958885.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 0.4792514443397522, | |
| "epoch": 18.55, | |
| "grad_norm": 5.65232515335083, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0967, | |
| "mean_token_accuracy": 0.9528796076774597, | |
| "num_tokens": 97220973.0, | |
| "step": 371 | |
| }, | |
| { | |
| "entropy": 0.47906193137168884, | |
| "epoch": 18.6, | |
| "grad_norm": 4.153817176818848, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0983, | |
| "mean_token_accuracy": 0.9545454382896423, | |
| "num_tokens": 97483051.0, | |
| "step": 372 | |
| }, | |
| { | |
| "entropy": 0.4794267416000366, | |
| "epoch": 18.65, | |
| "grad_norm": 4.057419300079346, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0891, | |
| "mean_token_accuracy": 0.9647576808929443, | |
| "num_tokens": 97745101.0, | |
| "step": 373 | |
| }, | |
| { | |
| "entropy": 0.4788510203361511, | |
| "epoch": 18.7, | |
| "grad_norm": 4.535802841186523, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1052, | |
| "mean_token_accuracy": 0.955997884273529, | |
| "num_tokens": 98007141.0, | |
| "step": 374 | |
| }, | |
| { | |
| "entropy": 0.4793304204940796, | |
| "epoch": 18.75, | |
| "grad_norm": 3.66812801361084, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0972, | |
| "mean_token_accuracy": 0.95691978931427, | |
| "num_tokens": 98269129.0, | |
| "step": 375 | |
| }, | |
| { | |
| "entropy": 0.4771343767642975, | |
| "epoch": 18.8, | |
| "grad_norm": 5.437928199768066, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1163, | |
| "mean_token_accuracy": 0.9437780976295471, | |
| "num_tokens": 98531221.0, | |
| "step": 376 | |
| }, | |
| { | |
| "entropy": 0.4786272644996643, | |
| "epoch": 18.85, | |
| "grad_norm": 7.437087059020996, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1095, | |
| "mean_token_accuracy": 0.9502487778663635, | |
| "num_tokens": 98793257.0, | |
| "step": 377 | |
| }, | |
| { | |
| "entropy": 0.47760748863220215, | |
| "epoch": 18.9, | |
| "grad_norm": 4.315995216369629, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0996, | |
| "mean_token_accuracy": 0.9553039073944092, | |
| "num_tokens": 99055330.0, | |
| "step": 378 | |
| }, | |
| { | |
| "entropy": 0.4789770841598511, | |
| "epoch": 18.95, | |
| "grad_norm": 3.436211109161377, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0968, | |
| "mean_token_accuracy": 0.9601989984512329, | |
| "num_tokens": 99317366.0, | |
| "step": 379 | |
| }, | |
| { | |
| "entropy": 0.4768607020378113, | |
| "epoch": 19.0, | |
| "grad_norm": 4.5564093589782715, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1028, | |
| "mean_token_accuracy": 0.9576333165168762, | |
| "num_tokens": 99579427.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "eval_entropy": 0.48156681656837463, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9072580933570862, | |
| "eval_num_tokens": 99579427.0, | |
| "eval_runtime": 0.5632, | |
| "eval_samples_per_second": 443.912, | |
| "eval_steps_per_second": 1.776, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 0.4807729125022888, | |
| "epoch": 19.05, | |
| "grad_norm": 2.999615430831909, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0948, | |
| "mean_token_accuracy": 0.9598582983016968, | |
| "num_tokens": 99841483.0, | |
| "step": 381 | |
| }, | |
| { | |
| "entropy": 0.47727420926094055, | |
| "epoch": 19.1, | |
| "grad_norm": 3.7125136852264404, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0898, | |
| "mean_token_accuracy": 0.9632495045661926, | |
| "num_tokens": 100103528.0, | |
| "step": 382 | |
| }, | |
| { | |
| "entropy": 0.4780922532081604, | |
| "epoch": 19.15, | |
| "grad_norm": 3.4127087593078613, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1037, | |
| "mean_token_accuracy": 0.955041766166687, | |
| "num_tokens": 100365579.0, | |
| "step": 383 | |
| }, | |
| { | |
| "entropy": 0.47612839937210083, | |
| "epoch": 19.2, | |
| "grad_norm": 5.690220832824707, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0873, | |
| "mean_token_accuracy": 0.9584121108055115, | |
| "num_tokens": 100627627.0, | |
| "step": 384 | |
| }, | |
| { | |
| "entropy": 0.4755370616912842, | |
| "epoch": 19.25, | |
| "grad_norm": 4.630006790161133, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1026, | |
| "mean_token_accuracy": 0.9599140882492065, | |
| "num_tokens": 100889716.0, | |
| "step": 385 | |
| }, | |
| { | |
| "entropy": 0.4773571789264679, | |
| "epoch": 19.3, | |
| "grad_norm": 4.160724639892578, | |
| "learning_rate": 1e-06, | |
| "loss": 0.092, | |
| "mean_token_accuracy": 0.9613651037216187, | |
| "num_tokens": 101151796.0, | |
| "step": 386 | |
| }, | |
| { | |
| "entropy": 0.4772469997406006, | |
| "epoch": 19.35, | |
| "grad_norm": 4.370746612548828, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0965, | |
| "mean_token_accuracy": 0.9557135105133057, | |
| "num_tokens": 101413822.0, | |
| "step": 387 | |
| }, | |
| { | |
| "entropy": 0.4769657552242279, | |
| "epoch": 19.4, | |
| "grad_norm": 3.9834535121917725, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0828, | |
| "mean_token_accuracy": 0.9644970297813416, | |
| "num_tokens": 101675903.0, | |
| "step": 388 | |
| }, | |
| { | |
| "entropy": 0.4751873016357422, | |
| "epoch": 19.45, | |
| "grad_norm": 5.0992021560668945, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0923, | |
| "mean_token_accuracy": 0.9595441818237305, | |
| "num_tokens": 101937954.0, | |
| "step": 389 | |
| }, | |
| { | |
| "entropy": 0.47450515627861023, | |
| "epoch": 19.5, | |
| "grad_norm": 6.339524269104004, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1009, | |
| "mean_token_accuracy": 0.9467408657073975, | |
| "num_tokens": 102200003.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 0.4756562411785126, | |
| "epoch": 19.55, | |
| "grad_norm": 4.202500820159912, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0917, | |
| "mean_token_accuracy": 0.9617834687232971, | |
| "num_tokens": 102462034.0, | |
| "step": 391 | |
| }, | |
| { | |
| "entropy": 0.47639748454093933, | |
| "epoch": 19.6, | |
| "grad_norm": 4.514294147491455, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0952, | |
| "mean_token_accuracy": 0.9570673704147339, | |
| "num_tokens": 102724075.0, | |
| "step": 392 | |
| }, | |
| { | |
| "entropy": 0.4771527051925659, | |
| "epoch": 19.65, | |
| "grad_norm": 4.23642110824585, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0959, | |
| "mean_token_accuracy": 0.9596773982048035, | |
| "num_tokens": 102986140.0, | |
| "step": 393 | |
| }, | |
| { | |
| "entropy": 0.476939857006073, | |
| "epoch": 19.7, | |
| "grad_norm": 3.8977198600769043, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0832, | |
| "mean_token_accuracy": 0.9618320465087891, | |
| "num_tokens": 103248205.0, | |
| "step": 394 | |
| }, | |
| { | |
| "entropy": 0.4789218604564667, | |
| "epoch": 19.75, | |
| "grad_norm": 4.690950393676758, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1086, | |
| "mean_token_accuracy": 0.9504048824310303, | |
| "num_tokens": 103510180.0, | |
| "step": 395 | |
| }, | |
| { | |
| "entropy": 0.4752090275287628, | |
| "epoch": 19.8, | |
| "grad_norm": 3.9899864196777344, | |
| "learning_rate": 1e-06, | |
| "loss": 0.091, | |
| "mean_token_accuracy": 0.9608516693115234, | |
| "num_tokens": 103772229.0, | |
| "step": 396 | |
| }, | |
| { | |
| "entropy": 0.47282856702804565, | |
| "epoch": 19.85, | |
| "grad_norm": 5.252200126647949, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1034, | |
| "mean_token_accuracy": 0.9577922224998474, | |
| "num_tokens": 104034285.0, | |
| "step": 397 | |
| }, | |
| { | |
| "entropy": 0.47455742955207825, | |
| "epoch": 19.9, | |
| "grad_norm": 7.813296318054199, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1068, | |
| "mean_token_accuracy": 0.9442644119262695, | |
| "num_tokens": 104296355.0, | |
| "step": 398 | |
| }, | |
| { | |
| "entropy": 0.4774863123893738, | |
| "epoch": 19.95, | |
| "grad_norm": 8.987563133239746, | |
| "learning_rate": 1e-06, | |
| "loss": 0.121, | |
| "mean_token_accuracy": 0.9434475302696228, | |
| "num_tokens": 104558391.0, | |
| "step": 399 | |
| }, | |
| { | |
| "entropy": 0.4767053723335266, | |
| "epoch": 20.0, | |
| "grad_norm": 5.698646068572998, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0958, | |
| "mean_token_accuracy": 0.9530162215232849, | |
| "num_tokens": 104820444.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_entropy": 0.4785197973251343, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9038978219032288, | |
| "eval_num_tokens": 104820444.0, | |
| "eval_runtime": 0.5628, | |
| "eval_samples_per_second": 444.175, | |
| "eval_steps_per_second": 1.777, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.47759389877319336, | |
| "epoch": 20.05, | |
| "grad_norm": 5.164842128753662, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0958, | |
| "mean_token_accuracy": 0.9596510529518127, | |
| "num_tokens": 105082475.0, | |
| "step": 401 | |
| }, | |
| { | |
| "entropy": 0.4755489230155945, | |
| "epoch": 20.1, | |
| "grad_norm": 4.541907787322998, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0988, | |
| "mean_token_accuracy": 0.9554093480110168, | |
| "num_tokens": 105344502.0, | |
| "step": 402 | |
| }, | |
| { | |
| "entropy": 0.4764818847179413, | |
| "epoch": 20.15, | |
| "grad_norm": 4.786900043487549, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1111, | |
| "mean_token_accuracy": 0.9530423283576965, | |
| "num_tokens": 105606574.0, | |
| "step": 403 | |
| }, | |
| { | |
| "entropy": 0.4768408536911011, | |
| "epoch": 20.2, | |
| "grad_norm": 5.436928749084473, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1035, | |
| "mean_token_accuracy": 0.9548913240432739, | |
| "num_tokens": 105868611.0, | |
| "step": 404 | |
| }, | |
| { | |
| "entropy": 0.4776271879673004, | |
| "epoch": 20.25, | |
| "grad_norm": 6.8953657150268555, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1086, | |
| "mean_token_accuracy": 0.9477487206459045, | |
| "num_tokens": 106130673.0, | |
| "step": 405 | |
| }, | |
| { | |
| "entropy": 0.47521114349365234, | |
| "epoch": 20.3, | |
| "grad_norm": 5.883774280548096, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1033, | |
| "mean_token_accuracy": 0.948503851890564, | |
| "num_tokens": 106392769.0, | |
| "step": 406 | |
| }, | |
| { | |
| "entropy": 0.47845274209976196, | |
| "epoch": 20.35, | |
| "grad_norm": 3.9064784049987793, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0857, | |
| "mean_token_accuracy": 0.9641460180282593, | |
| "num_tokens": 106654830.0, | |
| "step": 407 | |
| }, | |
| { | |
| "entropy": 0.4787122309207916, | |
| "epoch": 20.4, | |
| "grad_norm": 3.2227232456207275, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0992, | |
| "mean_token_accuracy": 0.9579360485076904, | |
| "num_tokens": 106916876.0, | |
| "step": 408 | |
| }, | |
| { | |
| "entropy": 0.4786139130592346, | |
| "epoch": 20.45, | |
| "grad_norm": 3.6466708183288574, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0906, | |
| "mean_token_accuracy": 0.9564660787582397, | |
| "num_tokens": 107178932.0, | |
| "step": 409 | |
| }, | |
| { | |
| "entropy": 0.47711610794067383, | |
| "epoch": 20.5, | |
| "grad_norm": 5.3844194412231445, | |
| "learning_rate": 1e-06, | |
| "loss": 0.098, | |
| "mean_token_accuracy": 0.9515488743782043, | |
| "num_tokens": 107440982.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 0.47591736912727356, | |
| "epoch": 20.55, | |
| "grad_norm": 4.034522533416748, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0857, | |
| "mean_token_accuracy": 0.9637036919593811, | |
| "num_tokens": 107703057.0, | |
| "step": 411 | |
| }, | |
| { | |
| "entropy": 0.4796789884567261, | |
| "epoch": 20.6, | |
| "grad_norm": 3.7229764461517334, | |
| "learning_rate": 1e-06, | |
| "loss": 0.084, | |
| "mean_token_accuracy": 0.9600798487663269, | |
| "num_tokens": 107965126.0, | |
| "step": 412 | |
| }, | |
| { | |
| "entropy": 0.47761350870132446, | |
| "epoch": 20.65, | |
| "grad_norm": 3.5426137447357178, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0894, | |
| "mean_token_accuracy": 0.9674220681190491, | |
| "num_tokens": 108227164.0, | |
| "step": 413 | |
| }, | |
| { | |
| "entropy": 0.480240136384964, | |
| "epoch": 20.7, | |
| "grad_norm": 3.649472713470459, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0876, | |
| "mean_token_accuracy": 0.9633389711380005, | |
| "num_tokens": 108489167.0, | |
| "step": 414 | |
| }, | |
| { | |
| "entropy": 0.4761279821395874, | |
| "epoch": 20.75, | |
| "grad_norm": 4.2589616775512695, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0914, | |
| "mean_token_accuracy": 0.96128249168396, | |
| "num_tokens": 108751215.0, | |
| "step": 415 | |
| }, | |
| { | |
| "entropy": 0.47693654894828796, | |
| "epoch": 20.8, | |
| "grad_norm": 4.516826152801514, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0915, | |
| "mean_token_accuracy": 0.9612069129943848, | |
| "num_tokens": 109013235.0, | |
| "step": 416 | |
| }, | |
| { | |
| "entropy": 0.4774542450904846, | |
| "epoch": 20.85, | |
| "grad_norm": 3.8276429176330566, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0906, | |
| "mean_token_accuracy": 0.9607046246528625, | |
| "num_tokens": 109275304.0, | |
| "step": 417 | |
| }, | |
| { | |
| "entropy": 0.47494709491729736, | |
| "epoch": 20.9, | |
| "grad_norm": 4.62904167175293, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0916, | |
| "mean_token_accuracy": 0.9534883499145508, | |
| "num_tokens": 109537356.0, | |
| "step": 418 | |
| }, | |
| { | |
| "entropy": 0.4711452126502991, | |
| "epoch": 20.95, | |
| "grad_norm": 4.15134334564209, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0912, | |
| "mean_token_accuracy": 0.9620253443717957, | |
| "num_tokens": 109799431.0, | |
| "step": 419 | |
| }, | |
| { | |
| "entropy": 0.47310134768486023, | |
| "epoch": 21.0, | |
| "grad_norm": 6.700887680053711, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0948, | |
| "mean_token_accuracy": 0.9579694271087646, | |
| "num_tokens": 110061460.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "eval_entropy": 0.4740469455718994, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9059139490127563, | |
| "eval_num_tokens": 110061460.0, | |
| "eval_runtime": 0.5634, | |
| "eval_samples_per_second": 443.746, | |
| "eval_steps_per_second": 1.775, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 0.470045268535614, | |
| "epoch": 21.05, | |
| "grad_norm": 7.74640417098999, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0845, | |
| "mean_token_accuracy": 0.9586715698242188, | |
| "num_tokens": 110323507.0, | |
| "step": 421 | |
| }, | |
| { | |
| "entropy": 0.47086870670318604, | |
| "epoch": 21.1, | |
| "grad_norm": 4.6416754722595215, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0808, | |
| "mean_token_accuracy": 0.9621498584747314, | |
| "num_tokens": 110585553.0, | |
| "step": 422 | |
| }, | |
| { | |
| "entropy": 0.4706752896308899, | |
| "epoch": 21.15, | |
| "grad_norm": 4.6220703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1057, | |
| "mean_token_accuracy": 0.952162504196167, | |
| "num_tokens": 110847573.0, | |
| "step": 423 | |
| }, | |
| { | |
| "entropy": 0.4702892303466797, | |
| "epoch": 21.2, | |
| "grad_norm": 4.489948272705078, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0966, | |
| "mean_token_accuracy": 0.9588235020637512, | |
| "num_tokens": 111109630.0, | |
| "step": 424 | |
| }, | |
| { | |
| "entropy": 0.46930834650993347, | |
| "epoch": 21.25, | |
| "grad_norm": 3.6754980087280273, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0863, | |
| "mean_token_accuracy": 0.9629878997802734, | |
| "num_tokens": 111371709.0, | |
| "step": 425 | |
| }, | |
| { | |
| "entropy": 0.4697002172470093, | |
| "epoch": 21.3, | |
| "grad_norm": 4.992099285125732, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0925, | |
| "mean_token_accuracy": 0.9623864889144897, | |
| "num_tokens": 111633745.0, | |
| "step": 426 | |
| }, | |
| { | |
| "entropy": 0.469596803188324, | |
| "epoch": 21.35, | |
| "grad_norm": 5.482630729675293, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0905, | |
| "mean_token_accuracy": 0.9579145908355713, | |
| "num_tokens": 111895798.0, | |
| "step": 427 | |
| }, | |
| { | |
| "entropy": 0.47250843048095703, | |
| "epoch": 21.4, | |
| "grad_norm": 4.3867716789245605, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0899, | |
| "mean_token_accuracy": 0.9563699960708618, | |
| "num_tokens": 112157846.0, | |
| "step": 428 | |
| }, | |
| { | |
| "entropy": 0.4697571396827698, | |
| "epoch": 21.45, | |
| "grad_norm": 4.48779296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0962, | |
| "mean_token_accuracy": 0.9523077011108398, | |
| "num_tokens": 112419937.0, | |
| "step": 429 | |
| }, | |
| { | |
| "entropy": 0.47207871079444885, | |
| "epoch": 21.5, | |
| "grad_norm": 4.785567760467529, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0903, | |
| "mean_token_accuracy": 0.9592496752738953, | |
| "num_tokens": 112681977.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 0.47401899099349976, | |
| "epoch": 21.55, | |
| "grad_norm": 4.775023460388184, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0924, | |
| "mean_token_accuracy": 0.9604700803756714, | |
| "num_tokens": 112944013.0, | |
| "step": 431 | |
| }, | |
| { | |
| "entropy": 0.4714542031288147, | |
| "epoch": 21.6, | |
| "grad_norm": 3.748880624771118, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0885, | |
| "mean_token_accuracy": 0.9565749168395996, | |
| "num_tokens": 113206076.0, | |
| "step": 432 | |
| }, | |
| { | |
| "entropy": 0.4722508192062378, | |
| "epoch": 21.65, | |
| "grad_norm": 4.005458831787109, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0837, | |
| "mean_token_accuracy": 0.9660633206367493, | |
| "num_tokens": 113468107.0, | |
| "step": 433 | |
| }, | |
| { | |
| "entropy": 0.47126466035842896, | |
| "epoch": 21.7, | |
| "grad_norm": 4.053618431091309, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0888, | |
| "mean_token_accuracy": 0.9616148471832275, | |
| "num_tokens": 113730145.0, | |
| "step": 434 | |
| }, | |
| { | |
| "entropy": 0.4720988869667053, | |
| "epoch": 21.75, | |
| "grad_norm": 3.8416616916656494, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0855, | |
| "mean_token_accuracy": 0.9624871611595154, | |
| "num_tokens": 113992189.0, | |
| "step": 435 | |
| }, | |
| { | |
| "entropy": 0.4716281294822693, | |
| "epoch": 21.8, | |
| "grad_norm": 4.562581539154053, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0901, | |
| "mean_token_accuracy": 0.9555829763412476, | |
| "num_tokens": 114254238.0, | |
| "step": 436 | |
| }, | |
| { | |
| "entropy": 0.4716986119747162, | |
| "epoch": 21.85, | |
| "grad_norm": 4.10395622253418, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0911, | |
| "mean_token_accuracy": 0.9650474190711975, | |
| "num_tokens": 114516321.0, | |
| "step": 437 | |
| }, | |
| { | |
| "entropy": 0.47269925475120544, | |
| "epoch": 21.9, | |
| "grad_norm": 4.068876266479492, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0828, | |
| "mean_token_accuracy": 0.9637249708175659, | |
| "num_tokens": 114778365.0, | |
| "step": 438 | |
| }, | |
| { | |
| "entropy": 0.47152841091156006, | |
| "epoch": 21.95, | |
| "grad_norm": 5.2423095703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0848, | |
| "mean_token_accuracy": 0.9640921354293823, | |
| "num_tokens": 115040434.0, | |
| "step": 439 | |
| }, | |
| { | |
| "entropy": 0.47160810232162476, | |
| "epoch": 22.0, | |
| "grad_norm": 5.08480167388916, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0932, | |
| "mean_token_accuracy": 0.9635722637176514, | |
| "num_tokens": 115302465.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "eval_entropy": 0.4727582335472107, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9119623899459839, | |
| "eval_num_tokens": 115302465.0, | |
| "eval_runtime": 0.5604, | |
| "eval_samples_per_second": 446.076, | |
| "eval_steps_per_second": 1.784, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 0.47174668312072754, | |
| "epoch": 22.05, | |
| "grad_norm": 3.2800099849700928, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0718, | |
| "mean_token_accuracy": 0.9681742191314697, | |
| "num_tokens": 115564519.0, | |
| "step": 441 | |
| }, | |
| { | |
| "entropy": 0.47139155864715576, | |
| "epoch": 22.1, | |
| "grad_norm": 4.268564701080322, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0982, | |
| "mean_token_accuracy": 0.9564149975776672, | |
| "num_tokens": 115826576.0, | |
| "step": 442 | |
| }, | |
| { | |
| "entropy": 0.47082388401031494, | |
| "epoch": 22.15, | |
| "grad_norm": 4.853943824768066, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0878, | |
| "mean_token_accuracy": 0.9619899392127991, | |
| "num_tokens": 116088628.0, | |
| "step": 443 | |
| }, | |
| { | |
| "entropy": 0.4726426601409912, | |
| "epoch": 22.2, | |
| "grad_norm": 3.3755438327789307, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0843, | |
| "mean_token_accuracy": 0.9618708491325378, | |
| "num_tokens": 116350660.0, | |
| "step": 444 | |
| }, | |
| { | |
| "entropy": 0.47002309560775757, | |
| "epoch": 22.25, | |
| "grad_norm": 3.2275404930114746, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0832, | |
| "mean_token_accuracy": 0.9626911282539368, | |
| "num_tokens": 116612690.0, | |
| "step": 445 | |
| }, | |
| { | |
| "entropy": 0.4685373902320862, | |
| "epoch": 22.3, | |
| "grad_norm": 5.329719066619873, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0913, | |
| "mean_token_accuracy": 0.9623402953147888, | |
| "num_tokens": 116874737.0, | |
| "step": 446 | |
| }, | |
| { | |
| "entropy": 0.4670417010784149, | |
| "epoch": 22.35, | |
| "grad_norm": 3.7413110733032227, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0838, | |
| "mean_token_accuracy": 0.9646719694137573, | |
| "num_tokens": 117136816.0, | |
| "step": 447 | |
| }, | |
| { | |
| "entropy": 0.46998491883277893, | |
| "epoch": 22.4, | |
| "grad_norm": 2.7414612770080566, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0775, | |
| "mean_token_accuracy": 0.9648514986038208, | |
| "num_tokens": 117398835.0, | |
| "step": 448 | |
| }, | |
| { | |
| "entropy": 0.4664579927921295, | |
| "epoch": 22.45, | |
| "grad_norm": 4.6384406089782715, | |
| "learning_rate": 1e-06, | |
| "loss": 0.073, | |
| "mean_token_accuracy": 0.9710144996643066, | |
| "num_tokens": 117660898.0, | |
| "step": 449 | |
| }, | |
| { | |
| "entropy": 0.46621328592300415, | |
| "epoch": 22.5, | |
| "grad_norm": 5.154250144958496, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0866, | |
| "mean_token_accuracy": 0.9636255502700806, | |
| "num_tokens": 117922970.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 0.46722596883773804, | |
| "epoch": 22.55, | |
| "grad_norm": 6.065870761871338, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0897, | |
| "mean_token_accuracy": 0.9548147916793823, | |
| "num_tokens": 118185045.0, | |
| "step": 451 | |
| }, | |
| { | |
| "entropy": 0.4652860760688782, | |
| "epoch": 22.6, | |
| "grad_norm": 4.755091190338135, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0709, | |
| "mean_token_accuracy": 0.9737588763237, | |
| "num_tokens": 118447114.0, | |
| "step": 452 | |
| }, | |
| { | |
| "entropy": 0.4672521650791168, | |
| "epoch": 22.65, | |
| "grad_norm": 4.636857509613037, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0871, | |
| "mean_token_accuracy": 0.9595959782600403, | |
| "num_tokens": 118709192.0, | |
| "step": 453 | |
| }, | |
| { | |
| "entropy": 0.46378546953201294, | |
| "epoch": 22.7, | |
| "grad_norm": 6.048754692077637, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0929, | |
| "mean_token_accuracy": 0.9589357972145081, | |
| "num_tokens": 118971217.0, | |
| "step": 454 | |
| }, | |
| { | |
| "entropy": 0.4664173722267151, | |
| "epoch": 22.75, | |
| "grad_norm": 4.586204528808594, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0926, | |
| "mean_token_accuracy": 0.9641411304473877, | |
| "num_tokens": 119233209.0, | |
| "step": 455 | |
| }, | |
| { | |
| "entropy": 0.46483659744262695, | |
| "epoch": 22.8, | |
| "grad_norm": 5.882786750793457, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0874, | |
| "mean_token_accuracy": 0.9628571271896362, | |
| "num_tokens": 119495268.0, | |
| "step": 456 | |
| }, | |
| { | |
| "entropy": 0.46636223793029785, | |
| "epoch": 22.85, | |
| "grad_norm": 8.683144569396973, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1024, | |
| "mean_token_accuracy": 0.9540635943412781, | |
| "num_tokens": 119757295.0, | |
| "step": 457 | |
| }, | |
| { | |
| "entropy": 0.4629400968551636, | |
| "epoch": 22.9, | |
| "grad_norm": 8.564299583435059, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1078, | |
| "mean_token_accuracy": 0.9609755873680115, | |
| "num_tokens": 120019382.0, | |
| "step": 458 | |
| }, | |
| { | |
| "entropy": 0.46590715646743774, | |
| "epoch": 22.95, | |
| "grad_norm": 4.7376275062561035, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0898, | |
| "mean_token_accuracy": 0.9637865424156189, | |
| "num_tokens": 120281450.0, | |
| "step": 459 | |
| }, | |
| { | |
| "entropy": 0.466198205947876, | |
| "epoch": 23.0, | |
| "grad_norm": 4.441730976104736, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0738, | |
| "mean_token_accuracy": 0.9684579372406006, | |
| "num_tokens": 120543491.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "eval_entropy": 0.46709567308425903, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9045698642730713, | |
| "eval_num_tokens": 120543491.0, | |
| "eval_runtime": 0.6287, | |
| "eval_samples_per_second": 397.673, | |
| "eval_steps_per_second": 1.591, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 0.46561282873153687, | |
| "epoch": 23.05, | |
| "grad_norm": 3.567425012588501, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0809, | |
| "mean_token_accuracy": 0.9651972055435181, | |
| "num_tokens": 120805544.0, | |
| "step": 461 | |
| }, | |
| { | |
| "entropy": 0.46857595443725586, | |
| "epoch": 23.1, | |
| "grad_norm": 4.191952228546143, | |
| "learning_rate": 1e-06, | |
| "loss": 0.082, | |
| "mean_token_accuracy": 0.9606382846832275, | |
| "num_tokens": 121067588.0, | |
| "step": 462 | |
| }, | |
| { | |
| "entropy": 0.4691683053970337, | |
| "epoch": 23.15, | |
| "grad_norm": 5.60888671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0849, | |
| "mean_token_accuracy": 0.9594594836235046, | |
| "num_tokens": 121329651.0, | |
| "step": 463 | |
| }, | |
| { | |
| "entropy": 0.46791988611221313, | |
| "epoch": 23.2, | |
| "grad_norm": 5.512171745300293, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0833, | |
| "mean_token_accuracy": 0.9642616510391235, | |
| "num_tokens": 121591694.0, | |
| "step": 464 | |
| }, | |
| { | |
| "entropy": 0.47018688917160034, | |
| "epoch": 23.25, | |
| "grad_norm": 6.818735122680664, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0927, | |
| "mean_token_accuracy": 0.9616252779960632, | |
| "num_tokens": 121853781.0, | |
| "step": 465 | |
| }, | |
| { | |
| "entropy": 0.4723391532897949, | |
| "epoch": 23.3, | |
| "grad_norm": 3.6118452548980713, | |
| "learning_rate": 1e-06, | |
| "loss": 0.076, | |
| "mean_token_accuracy": 0.9650793671607971, | |
| "num_tokens": 122115769.0, | |
| "step": 466 | |
| }, | |
| { | |
| "entropy": 0.4676530957221985, | |
| "epoch": 23.35, | |
| "grad_norm": 4.851842880249023, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0862, | |
| "mean_token_accuracy": 0.9640449285507202, | |
| "num_tokens": 122377862.0, | |
| "step": 467 | |
| }, | |
| { | |
| "entropy": 0.46646612882614136, | |
| "epoch": 23.4, | |
| "grad_norm": 5.6723175048828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0904, | |
| "mean_token_accuracy": 0.9647058844566345, | |
| "num_tokens": 122639914.0, | |
| "step": 468 | |
| }, | |
| { | |
| "entropy": 0.46782416105270386, | |
| "epoch": 23.45, | |
| "grad_norm": 6.064637184143066, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0868, | |
| "mean_token_accuracy": 0.9623545408248901, | |
| "num_tokens": 122901968.0, | |
| "step": 469 | |
| }, | |
| { | |
| "entropy": 0.4674442410469055, | |
| "epoch": 23.5, | |
| "grad_norm": 5.13816499710083, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0889, | |
| "mean_token_accuracy": 0.9569685459136963, | |
| "num_tokens": 123164019.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 0.4655379354953766, | |
| "epoch": 23.55, | |
| "grad_norm": 5.55079984664917, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0855, | |
| "mean_token_accuracy": 0.9623115658760071, | |
| "num_tokens": 123426103.0, | |
| "step": 471 | |
| }, | |
| { | |
| "entropy": 0.4674234986305237, | |
| "epoch": 23.6, | |
| "grad_norm": 4.348241806030273, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0795, | |
| "mean_token_accuracy": 0.9644389748573303, | |
| "num_tokens": 123688162.0, | |
| "step": 472 | |
| }, | |
| { | |
| "entropy": 0.4678855538368225, | |
| "epoch": 23.65, | |
| "grad_norm": 4.124541282653809, | |
| "learning_rate": 1e-06, | |
| "loss": 0.065, | |
| "mean_token_accuracy": 0.9705128073692322, | |
| "num_tokens": 123950216.0, | |
| "step": 473 | |
| }, | |
| { | |
| "entropy": 0.4669386148452759, | |
| "epoch": 23.7, | |
| "grad_norm": 4.676552772521973, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0745, | |
| "mean_token_accuracy": 0.970704197883606, | |
| "num_tokens": 124212287.0, | |
| "step": 474 | |
| }, | |
| { | |
| "entropy": 0.46618372201919556, | |
| "epoch": 23.75, | |
| "grad_norm": 4.589600086212158, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0823, | |
| "mean_token_accuracy": 0.9622377753257751, | |
| "num_tokens": 124474343.0, | |
| "step": 475 | |
| }, | |
| { | |
| "entropy": 0.4674683213233948, | |
| "epoch": 23.8, | |
| "grad_norm": 5.328636646270752, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0744, | |
| "mean_token_accuracy": 0.9667332172393799, | |
| "num_tokens": 124736406.0, | |
| "step": 476 | |
| }, | |
| { | |
| "entropy": 0.4664192497730255, | |
| "epoch": 23.85, | |
| "grad_norm": 6.037339210510254, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0935, | |
| "mean_token_accuracy": 0.9550173282623291, | |
| "num_tokens": 124998444.0, | |
| "step": 477 | |
| }, | |
| { | |
| "entropy": 0.46771666407585144, | |
| "epoch": 23.9, | |
| "grad_norm": 5.8049468994140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0897, | |
| "mean_token_accuracy": 0.9661781191825867, | |
| "num_tokens": 125260481.0, | |
| "step": 478 | |
| }, | |
| { | |
| "entropy": 0.46755754947662354, | |
| "epoch": 23.95, | |
| "grad_norm": 6.086460113525391, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1008, | |
| "mean_token_accuracy": 0.9578189253807068, | |
| "num_tokens": 125522490.0, | |
| "step": 479 | |
| }, | |
| { | |
| "entropy": 0.4669610261917114, | |
| "epoch": 24.0, | |
| "grad_norm": 4.249874114990234, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0785, | |
| "mean_token_accuracy": 0.9679803252220154, | |
| "num_tokens": 125784509.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "eval_entropy": 0.46940919756889343, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9126344323158264, | |
| "eval_num_tokens": 125784509.0, | |
| "eval_runtime": 0.5641, | |
| "eval_samples_per_second": 443.21, | |
| "eval_steps_per_second": 1.773, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 0.4667162299156189, | |
| "epoch": 24.05, | |
| "grad_norm": 3.9976305961608887, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0868, | |
| "mean_token_accuracy": 0.9649001955986023, | |
| "num_tokens": 126046588.0, | |
| "step": 481 | |
| }, | |
| { | |
| "entropy": 0.46550101041793823, | |
| "epoch": 24.1, | |
| "grad_norm": 3.9286561012268066, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0735, | |
| "mean_token_accuracy": 0.968622088432312, | |
| "num_tokens": 126308647.0, | |
| "step": 482 | |
| }, | |
| { | |
| "entropy": 0.46659862995147705, | |
| "epoch": 24.15, | |
| "grad_norm": 4.12390661239624, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0806, | |
| "mean_token_accuracy": 0.9664310812950134, | |
| "num_tokens": 126570674.0, | |
| "step": 483 | |
| }, | |
| { | |
| "entropy": 0.4668567180633545, | |
| "epoch": 24.2, | |
| "grad_norm": 4.626502990722656, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0851, | |
| "mean_token_accuracy": 0.9562251567840576, | |
| "num_tokens": 126832696.0, | |
| "step": 484 | |
| }, | |
| { | |
| "entropy": 0.4655776619911194, | |
| "epoch": 24.25, | |
| "grad_norm": 6.302225589752197, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0797, | |
| "mean_token_accuracy": 0.9599271416664124, | |
| "num_tokens": 127094738.0, | |
| "step": 485 | |
| }, | |
| { | |
| "entropy": 0.4636232852935791, | |
| "epoch": 24.3, | |
| "grad_norm": 6.734894752502441, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0838, | |
| "mean_token_accuracy": 0.9588276147842407, | |
| "num_tokens": 127356764.0, | |
| "step": 486 | |
| }, | |
| { | |
| "entropy": 0.4671253561973572, | |
| "epoch": 24.35, | |
| "grad_norm": 4.378500938415527, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0689, | |
| "mean_token_accuracy": 0.9688196182250977, | |
| "num_tokens": 127618790.0, | |
| "step": 487 | |
| }, | |
| { | |
| "entropy": 0.4644678235054016, | |
| "epoch": 24.4, | |
| "grad_norm": 5.7774858474731445, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0958, | |
| "mean_token_accuracy": 0.9631399512290955, | |
| "num_tokens": 127880881.0, | |
| "step": 488 | |
| }, | |
| { | |
| "entropy": 0.4653833210468292, | |
| "epoch": 24.45, | |
| "grad_norm": 3.9272964000701904, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0625, | |
| "mean_token_accuracy": 0.9740871787071228, | |
| "num_tokens": 128142908.0, | |
| "step": 489 | |
| }, | |
| { | |
| "entropy": 0.4655180275440216, | |
| "epoch": 24.5, | |
| "grad_norm": 4.535080909729004, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0791, | |
| "mean_token_accuracy": 0.969737708568573, | |
| "num_tokens": 128404955.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 0.46757546067237854, | |
| "epoch": 24.55, | |
| "grad_norm": 4.897022724151611, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0694, | |
| "mean_token_accuracy": 0.9702759981155396, | |
| "num_tokens": 128667003.0, | |
| "step": 491 | |
| }, | |
| { | |
| "entropy": 0.466405987739563, | |
| "epoch": 24.6, | |
| "grad_norm": 4.4283037185668945, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0849, | |
| "mean_token_accuracy": 0.9647526741027832, | |
| "num_tokens": 128929025.0, | |
| "step": 492 | |
| }, | |
| { | |
| "entropy": 0.46565863490104675, | |
| "epoch": 24.65, | |
| "grad_norm": 5.80818510055542, | |
| "learning_rate": 1e-06, | |
| "loss": 0.082, | |
| "mean_token_accuracy": 0.9615384340286255, | |
| "num_tokens": 129191087.0, | |
| "step": 493 | |
| }, | |
| { | |
| "entropy": 0.4646396338939667, | |
| "epoch": 24.7, | |
| "grad_norm": 5.854940891265869, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0726, | |
| "mean_token_accuracy": 0.9698593616485596, | |
| "num_tokens": 129453140.0, | |
| "step": 494 | |
| }, | |
| { | |
| "entropy": 0.46409285068511963, | |
| "epoch": 24.75, | |
| "grad_norm": 4.3521552085876465, | |
| "learning_rate": 1e-06, | |
| "loss": 0.071, | |
| "mean_token_accuracy": 0.9737654328346252, | |
| "num_tokens": 129715194.0, | |
| "step": 495 | |
| }, | |
| { | |
| "entropy": 0.46351325511932373, | |
| "epoch": 24.8, | |
| "grad_norm": 10.49264144897461, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0912, | |
| "mean_token_accuracy": 0.9609375, | |
| "num_tokens": 129977261.0, | |
| "step": 496 | |
| }, | |
| { | |
| "entropy": 0.4629173278808594, | |
| "epoch": 24.85, | |
| "grad_norm": 5.705246448516846, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0736, | |
| "mean_token_accuracy": 0.9654731750488281, | |
| "num_tokens": 130239319.0, | |
| "step": 497 | |
| }, | |
| { | |
| "entropy": 0.4622166156768799, | |
| "epoch": 24.9, | |
| "grad_norm": 4.9481706619262695, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0768, | |
| "mean_token_accuracy": 0.9647132754325867, | |
| "num_tokens": 130501400.0, | |
| "step": 498 | |
| }, | |
| { | |
| "entropy": 0.4619133770465851, | |
| "epoch": 24.95, | |
| "grad_norm": 5.516783714294434, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0972, | |
| "mean_token_accuracy": 0.9551239013671875, | |
| "num_tokens": 130763486.0, | |
| "step": 499 | |
| }, | |
| { | |
| "entropy": 0.46198371052742004, | |
| "epoch": 25.0, | |
| "grad_norm": 4.832233905792236, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0775, | |
| "mean_token_accuracy": 0.9691321849822998, | |
| "num_tokens": 131025532.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "eval_entropy": 0.464277058839798, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9025537371635437, | |
| "eval_num_tokens": 131025532.0, | |
| "eval_runtime": 0.564, | |
| "eval_samples_per_second": 443.297, | |
| "eval_steps_per_second": 1.773, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 0.4616357684135437, | |
| "epoch": 25.05, | |
| "grad_norm": 5.327179431915283, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0762, | |
| "mean_token_accuracy": 0.965786337852478, | |
| "num_tokens": 131287560.0, | |
| "step": 501 | |
| }, | |
| { | |
| "entropy": 0.4612486958503723, | |
| "epoch": 25.1, | |
| "grad_norm": 5.3239426612854, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0772, | |
| "mean_token_accuracy": 0.9653465151786804, | |
| "num_tokens": 131549633.0, | |
| "step": 502 | |
| }, | |
| { | |
| "entropy": 0.4629809260368347, | |
| "epoch": 25.15, | |
| "grad_norm": 4.609165191650391, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0789, | |
| "mean_token_accuracy": 0.9663962721824646, | |
| "num_tokens": 131811688.0, | |
| "step": 503 | |
| }, | |
| { | |
| "entropy": 0.46398937702178955, | |
| "epoch": 25.2, | |
| "grad_norm": 4.2075700759887695, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0781, | |
| "mean_token_accuracy": 0.9686935544013977, | |
| "num_tokens": 132073678.0, | |
| "step": 504 | |
| }, | |
| { | |
| "entropy": 0.46296095848083496, | |
| "epoch": 25.25, | |
| "grad_norm": 4.920988082885742, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0642, | |
| "mean_token_accuracy": 0.9684147834777832, | |
| "num_tokens": 132335718.0, | |
| "step": 505 | |
| }, | |
| { | |
| "entropy": 0.45965808629989624, | |
| "epoch": 25.3, | |
| "grad_norm": 3.9255125522613525, | |
| "learning_rate": 1e-06, | |
| "loss": 0.068, | |
| "mean_token_accuracy": 0.9701306819915771, | |
| "num_tokens": 132597786.0, | |
| "step": 506 | |
| }, | |
| { | |
| "entropy": 0.45947885513305664, | |
| "epoch": 25.35, | |
| "grad_norm": 4.092470169067383, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0829, | |
| "mean_token_accuracy": 0.9691497087478638, | |
| "num_tokens": 132859840.0, | |
| "step": 507 | |
| }, | |
| { | |
| "entropy": 0.45919084548950195, | |
| "epoch": 25.4, | |
| "grad_norm": 4.688226699829102, | |
| "learning_rate": 1e-06, | |
| "loss": 0.071, | |
| "mean_token_accuracy": 0.9719813466072083, | |
| "num_tokens": 133121899.0, | |
| "step": 508 | |
| }, | |
| { | |
| "entropy": 0.4593312442302704, | |
| "epoch": 25.45, | |
| "grad_norm": 4.132238388061523, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0813, | |
| "mean_token_accuracy": 0.9599350094795227, | |
| "num_tokens": 133383910.0, | |
| "step": 509 | |
| }, | |
| { | |
| "entropy": 0.45791468024253845, | |
| "epoch": 25.5, | |
| "grad_norm": 3.8919591903686523, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0658, | |
| "mean_token_accuracy": 0.9714285731315613, | |
| "num_tokens": 133645956.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 0.45953884720802307, | |
| "epoch": 25.55, | |
| "grad_norm": 6.311083793640137, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0643, | |
| "mean_token_accuracy": 0.9748603105545044, | |
| "num_tokens": 133908014.0, | |
| "step": 511 | |
| }, | |
| { | |
| "entropy": 0.4577752947807312, | |
| "epoch": 25.6, | |
| "grad_norm": 11.283148765563965, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0902, | |
| "mean_token_accuracy": 0.9606496095657349, | |
| "num_tokens": 134170076.0, | |
| "step": 512 | |
| }, | |
| { | |
| "entropy": 0.45760929584503174, | |
| "epoch": 25.65, | |
| "grad_norm": 4.889045715332031, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0731, | |
| "mean_token_accuracy": 0.9718985557556152, | |
| "num_tokens": 134432128.0, | |
| "step": 513 | |
| }, | |
| { | |
| "entropy": 0.45951539278030396, | |
| "epoch": 25.7, | |
| "grad_norm": 4.273900508880615, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0713, | |
| "mean_token_accuracy": 0.9665513038635254, | |
| "num_tokens": 134694191.0, | |
| "step": 514 | |
| }, | |
| { | |
| "entropy": 0.458004355430603, | |
| "epoch": 25.75, | |
| "grad_norm": 4.518304347991943, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0696, | |
| "mean_token_accuracy": 0.9705690145492554, | |
| "num_tokens": 134956247.0, | |
| "step": 515 | |
| }, | |
| { | |
| "entropy": 0.4571762979030609, | |
| "epoch": 25.8, | |
| "grad_norm": 5.303156852722168, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0808, | |
| "mean_token_accuracy": 0.9643678069114685, | |
| "num_tokens": 135218283.0, | |
| "step": 516 | |
| }, | |
| { | |
| "entropy": 0.457592248916626, | |
| "epoch": 25.85, | |
| "grad_norm": 4.145455837249756, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0708, | |
| "mean_token_accuracy": 0.9692658185958862, | |
| "num_tokens": 135480369.0, | |
| "step": 517 | |
| }, | |
| { | |
| "entropy": 0.4571569561958313, | |
| "epoch": 25.9, | |
| "grad_norm": 5.37058162689209, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0775, | |
| "mean_token_accuracy": 0.9683794379234314, | |
| "num_tokens": 135742414.0, | |
| "step": 518 | |
| }, | |
| { | |
| "entropy": 0.45553267002105713, | |
| "epoch": 25.95, | |
| "grad_norm": 6.640298843383789, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0802, | |
| "mean_token_accuracy": 0.9627623558044434, | |
| "num_tokens": 136004484.0, | |
| "step": 519 | |
| }, | |
| { | |
| "entropy": 0.45771491527557373, | |
| "epoch": 26.0, | |
| "grad_norm": 5.2289958000183105, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0812, | |
| "mean_token_accuracy": 0.9678688645362854, | |
| "num_tokens": 136266536.0, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 26.0, | |
| "eval_entropy": 0.45934179425239563, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9032257795333862, | |
| "eval_num_tokens": 136266536.0, | |
| "eval_runtime": 0.563, | |
| "eval_samples_per_second": 444.014, | |
| "eval_steps_per_second": 1.776, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 0.4587504267692566, | |
| "epoch": 26.05, | |
| "grad_norm": 3.8674097061157227, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0728, | |
| "mean_token_accuracy": 0.9732477068901062, | |
| "num_tokens": 136528569.0, | |
| "step": 521 | |
| }, | |
| { | |
| "entropy": 0.45837363600730896, | |
| "epoch": 26.1, | |
| "grad_norm": 5.667929172515869, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0698, | |
| "mean_token_accuracy": 0.9693174958229065, | |
| "num_tokens": 136790660.0, | |
| "step": 522 | |
| }, | |
| { | |
| "entropy": 0.45661094784736633, | |
| "epoch": 26.15, | |
| "grad_norm": 5.02635383605957, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0727, | |
| "mean_token_accuracy": 0.9678813815116882, | |
| "num_tokens": 137052707.0, | |
| "step": 523 | |
| }, | |
| { | |
| "entropy": 0.4580841362476349, | |
| "epoch": 26.2, | |
| "grad_norm": 4.592870712280273, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0736, | |
| "mean_token_accuracy": 0.9665246605873108, | |
| "num_tokens": 137314778.0, | |
| "step": 524 | |
| }, | |
| { | |
| "entropy": 0.45721596479415894, | |
| "epoch": 26.25, | |
| "grad_norm": 3.333099603652954, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0595, | |
| "mean_token_accuracy": 0.9748427867889404, | |
| "num_tokens": 137576868.0, | |
| "step": 525 | |
| }, | |
| { | |
| "entropy": 0.4584483504295349, | |
| "epoch": 26.3, | |
| "grad_norm": 4.201588153839111, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0764, | |
| "mean_token_accuracy": 0.9658351540565491, | |
| "num_tokens": 137838876.0, | |
| "step": 526 | |
| }, | |
| { | |
| "entropy": 0.45824259519577026, | |
| "epoch": 26.35, | |
| "grad_norm": 4.535568714141846, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0677, | |
| "mean_token_accuracy": 0.9732847809791565, | |
| "num_tokens": 138100918.0, | |
| "step": 527 | |
| }, | |
| { | |
| "entropy": 0.45829930901527405, | |
| "epoch": 26.4, | |
| "grad_norm": 7.854061603546143, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0634, | |
| "mean_token_accuracy": 0.9765415787696838, | |
| "num_tokens": 138362937.0, | |
| "step": 528 | |
| }, | |
| { | |
| "entropy": 0.4546545743942261, | |
| "epoch": 26.45, | |
| "grad_norm": 10.19962215423584, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0741, | |
| "mean_token_accuracy": 0.9672130942344666, | |
| "num_tokens": 138625004.0, | |
| "step": 529 | |
| }, | |
| { | |
| "entropy": 0.4562210440635681, | |
| "epoch": 26.5, | |
| "grad_norm": 7.326644420623779, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0822, | |
| "mean_token_accuracy": 0.9659023880958557, | |
| "num_tokens": 138887034.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 0.4569784104824066, | |
| "epoch": 26.55, | |
| "grad_norm": 4.9447736740112305, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0741, | |
| "mean_token_accuracy": 0.9705063700675964, | |
| "num_tokens": 139149061.0, | |
| "step": 531 | |
| }, | |
| { | |
| "entropy": 0.45219576358795166, | |
| "epoch": 26.6, | |
| "grad_norm": 3.8060805797576904, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0724, | |
| "mean_token_accuracy": 0.9715009331703186, | |
| "num_tokens": 139411134.0, | |
| "step": 532 | |
| }, | |
| { | |
| "entropy": 0.45544517040252686, | |
| "epoch": 26.65, | |
| "grad_norm": 6.335866928100586, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0774, | |
| "mean_token_accuracy": 0.9698432087898254, | |
| "num_tokens": 139673154.0, | |
| "step": 533 | |
| }, | |
| { | |
| "entropy": 0.4518481492996216, | |
| "epoch": 26.7, | |
| "grad_norm": 6.290351867675781, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0833, | |
| "mean_token_accuracy": 0.9635453820228577, | |
| "num_tokens": 139935245.0, | |
| "step": 534 | |
| }, | |
| { | |
| "entropy": 0.45323824882507324, | |
| "epoch": 26.75, | |
| "grad_norm": 7.986852169036865, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0808, | |
| "mean_token_accuracy": 0.9629629850387573, | |
| "num_tokens": 140197320.0, | |
| "step": 535 | |
| }, | |
| { | |
| "entropy": 0.45260536670684814, | |
| "epoch": 26.8, | |
| "grad_norm": 6.8090105056762695, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0767, | |
| "mean_token_accuracy": 0.9647606611251831, | |
| "num_tokens": 140459384.0, | |
| "step": 536 | |
| }, | |
| { | |
| "entropy": 0.45437586307525635, | |
| "epoch": 26.85, | |
| "grad_norm": 5.623941898345947, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0723, | |
| "mean_token_accuracy": 0.9687874913215637, | |
| "num_tokens": 140721412.0, | |
| "step": 537 | |
| }, | |
| { | |
| "entropy": 0.45372387766838074, | |
| "epoch": 26.9, | |
| "grad_norm": 6.018904209136963, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0726, | |
| "mean_token_accuracy": 0.96875, | |
| "num_tokens": 140983473.0, | |
| "step": 538 | |
| }, | |
| { | |
| "entropy": 0.4547538161277771, | |
| "epoch": 26.95, | |
| "grad_norm": 4.399332046508789, | |
| "learning_rate": 1e-06, | |
| "loss": 0.066, | |
| "mean_token_accuracy": 0.9699872136116028, | |
| "num_tokens": 141245533.0, | |
| "step": 539 | |
| }, | |
| { | |
| "entropy": 0.4539608359336853, | |
| "epoch": 27.0, | |
| "grad_norm": 4.773989677429199, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0754, | |
| "mean_token_accuracy": 0.9662853479385376, | |
| "num_tokens": 141507556.0, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 27.0, | |
| "eval_entropy": 0.4568031132221222, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9045698642730713, | |
| "eval_num_tokens": 141507556.0, | |
| "eval_runtime": 0.5608, | |
| "eval_samples_per_second": 445.81, | |
| "eval_steps_per_second": 1.783, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 0.4549258053302765, | |
| "epoch": 27.05, | |
| "grad_norm": 5.992208957672119, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0696, | |
| "mean_token_accuracy": 0.9692832827568054, | |
| "num_tokens": 141769581.0, | |
| "step": 541 | |
| }, | |
| { | |
| "entropy": 0.45245492458343506, | |
| "epoch": 27.1, | |
| "grad_norm": 5.600193500518799, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0684, | |
| "mean_token_accuracy": 0.9731225371360779, | |
| "num_tokens": 142031637.0, | |
| "step": 542 | |
| }, | |
| { | |
| "entropy": 0.453826367855072, | |
| "epoch": 27.15, | |
| "grad_norm": 6.150087356567383, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0743, | |
| "mean_token_accuracy": 0.9659667015075684, | |
| "num_tokens": 142293677.0, | |
| "step": 543 | |
| }, | |
| { | |
| "entropy": 0.45176124572753906, | |
| "epoch": 27.2, | |
| "grad_norm": 5.019855976104736, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0599, | |
| "mean_token_accuracy": 0.9740341901779175, | |
| "num_tokens": 142555717.0, | |
| "step": 544 | |
| }, | |
| { | |
| "entropy": 0.45249661803245544, | |
| "epoch": 27.25, | |
| "grad_norm": 3.5191256999969482, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0639, | |
| "mean_token_accuracy": 0.9764208197593689, | |
| "num_tokens": 142817799.0, | |
| "step": 545 | |
| }, | |
| { | |
| "entropy": 0.45174354314804077, | |
| "epoch": 27.3, | |
| "grad_norm": 5.357618808746338, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0751, | |
| "mean_token_accuracy": 0.9682329297065735, | |
| "num_tokens": 143079903.0, | |
| "step": 546 | |
| }, | |
| { | |
| "entropy": 0.4494438171386719, | |
| "epoch": 27.35, | |
| "grad_norm": 3.706282377243042, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0693, | |
| "mean_token_accuracy": 0.9701896905899048, | |
| "num_tokens": 143341978.0, | |
| "step": 547 | |
| }, | |
| { | |
| "entropy": 0.4479014277458191, | |
| "epoch": 27.4, | |
| "grad_norm": 7.060003280639648, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0689, | |
| "mean_token_accuracy": 0.973009467124939, | |
| "num_tokens": 143604053.0, | |
| "step": 548 | |
| }, | |
| { | |
| "entropy": 0.44879603385925293, | |
| "epoch": 27.45, | |
| "grad_norm": 6.722444534301758, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0649, | |
| "mean_token_accuracy": 0.974518358707428, | |
| "num_tokens": 143866123.0, | |
| "step": 549 | |
| }, | |
| { | |
| "entropy": 0.4483737051486969, | |
| "epoch": 27.5, | |
| "grad_norm": 6.251794815063477, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0762, | |
| "mean_token_accuracy": 0.9707950353622437, | |
| "num_tokens": 144128136.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 0.4492543339729309, | |
| "epoch": 27.55, | |
| "grad_norm": 4.857920169830322, | |
| "learning_rate": 1e-06, | |
| "loss": 0.059, | |
| "mean_token_accuracy": 0.973440408706665, | |
| "num_tokens": 144390216.0, | |
| "step": 551 | |
| }, | |
| { | |
| "entropy": 0.448012113571167, | |
| "epoch": 27.6, | |
| "grad_norm": 4.787059307098389, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0687, | |
| "mean_token_accuracy": 0.968769907951355, | |
| "num_tokens": 144652246.0, | |
| "step": 552 | |
| }, | |
| { | |
| "entropy": 0.4510306119918823, | |
| "epoch": 27.65, | |
| "grad_norm": 7.7825727462768555, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0957, | |
| "mean_token_accuracy": 0.9633767604827881, | |
| "num_tokens": 144914285.0, | |
| "step": 553 | |
| }, | |
| { | |
| "entropy": 0.4503524899482727, | |
| "epoch": 27.7, | |
| "grad_norm": 6.944081783294678, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0658, | |
| "mean_token_accuracy": 0.9696066975593567, | |
| "num_tokens": 145176325.0, | |
| "step": 554 | |
| }, | |
| { | |
| "entropy": 0.45102667808532715, | |
| "epoch": 27.75, | |
| "grad_norm": 6.495534896850586, | |
| "learning_rate": 1e-06, | |
| "loss": 0.066, | |
| "mean_token_accuracy": 0.9710467457771301, | |
| "num_tokens": 145438351.0, | |
| "step": 555 | |
| }, | |
| { | |
| "entropy": 0.45106881856918335, | |
| "epoch": 27.8, | |
| "grad_norm": 5.490335464477539, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0585, | |
| "mean_token_accuracy": 0.9763991832733154, | |
| "num_tokens": 145700394.0, | |
| "step": 556 | |
| }, | |
| { | |
| "entropy": 0.4523155689239502, | |
| "epoch": 27.85, | |
| "grad_norm": 5.689394950866699, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0652, | |
| "mean_token_accuracy": 0.9750356674194336, | |
| "num_tokens": 145962455.0, | |
| "step": 557 | |
| }, | |
| { | |
| "entropy": 0.4528997540473938, | |
| "epoch": 27.9, | |
| "grad_norm": 5.096836090087891, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0661, | |
| "mean_token_accuracy": 0.9740124940872192, | |
| "num_tokens": 146224477.0, | |
| "step": 558 | |
| }, | |
| { | |
| "entropy": 0.4539637267589569, | |
| "epoch": 27.95, | |
| "grad_norm": 6.182263374328613, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0849, | |
| "mean_token_accuracy": 0.9613526463508606, | |
| "num_tokens": 146486519.0, | |
| "step": 559 | |
| }, | |
| { | |
| "entropy": 0.4534997344017029, | |
| "epoch": 28.0, | |
| "grad_norm": 4.790921688079834, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0804, | |
| "mean_token_accuracy": 0.9655537605285645, | |
| "num_tokens": 146748570.0, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "eval_entropy": 0.4555630087852478, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9032257795333862, | |
| "eval_num_tokens": 146748570.0, | |
| "eval_runtime": 0.563, | |
| "eval_samples_per_second": 444.059, | |
| "eval_steps_per_second": 1.776, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 0.4534584879875183, | |
| "epoch": 28.05, | |
| "grad_norm": 11.155506134033203, | |
| "learning_rate": 1e-06, | |
| "loss": 0.078, | |
| "mean_token_accuracy": 0.9656893610954285, | |
| "num_tokens": 147010634.0, | |
| "step": 561 | |
| }, | |
| { | |
| "entropy": 0.45180657505989075, | |
| "epoch": 28.1, | |
| "grad_norm": 5.834336280822754, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0753, | |
| "mean_token_accuracy": 0.9643248915672302, | |
| "num_tokens": 147272686.0, | |
| "step": 562 | |
| }, | |
| { | |
| "entropy": 0.44956350326538086, | |
| "epoch": 28.15, | |
| "grad_norm": 5.324082851409912, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0641, | |
| "mean_token_accuracy": 0.9707057476043701, | |
| "num_tokens": 147534715.0, | |
| "step": 563 | |
| }, | |
| { | |
| "entropy": 0.4504123628139496, | |
| "epoch": 28.2, | |
| "grad_norm": 5.74534273147583, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0716, | |
| "mean_token_accuracy": 0.9698461294174194, | |
| "num_tokens": 147796768.0, | |
| "step": 564 | |
| }, | |
| { | |
| "entropy": 0.450857013463974, | |
| "epoch": 28.25, | |
| "grad_norm": 7.693669319152832, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0629, | |
| "mean_token_accuracy": 0.9717868566513062, | |
| "num_tokens": 148058824.0, | |
| "step": 565 | |
| }, | |
| { | |
| "entropy": 0.4519267976284027, | |
| "epoch": 28.3, | |
| "grad_norm": 6.2823662757873535, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0635, | |
| "mean_token_accuracy": 0.9744499921798706, | |
| "num_tokens": 148320892.0, | |
| "step": 566 | |
| }, | |
| { | |
| "entropy": 0.45379358530044556, | |
| "epoch": 28.35, | |
| "grad_norm": 4.106582164764404, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0595, | |
| "mean_token_accuracy": 0.9773442149162292, | |
| "num_tokens": 148582909.0, | |
| "step": 567 | |
| }, | |
| { | |
| "entropy": 0.4516447186470032, | |
| "epoch": 28.4, | |
| "grad_norm": 3.7747578620910645, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0601, | |
| "mean_token_accuracy": 0.9765396118164062, | |
| "num_tokens": 148844998.0, | |
| "step": 568 | |
| }, | |
| { | |
| "entropy": 0.4513830542564392, | |
| "epoch": 28.45, | |
| "grad_norm": 6.135379791259766, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0704, | |
| "mean_token_accuracy": 0.9721642136573792, | |
| "num_tokens": 149107061.0, | |
| "step": 569 | |
| }, | |
| { | |
| "entropy": 0.4540482759475708, | |
| "epoch": 28.5, | |
| "grad_norm": 5.572679042816162, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0591, | |
| "mean_token_accuracy": 0.977356493473053, | |
| "num_tokens": 149369124.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 0.4550696015357971, | |
| "epoch": 28.55, | |
| "grad_norm": 4.970890998840332, | |
| "learning_rate": 1e-06, | |
| "loss": 0.076, | |
| "mean_token_accuracy": 0.9676320552825928, | |
| "num_tokens": 149631148.0, | |
| "step": 571 | |
| }, | |
| { | |
| "entropy": 0.4574969410896301, | |
| "epoch": 28.6, | |
| "grad_norm": 6.8962483406066895, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0753, | |
| "mean_token_accuracy": 0.9676133394241333, | |
| "num_tokens": 149893154.0, | |
| "step": 572 | |
| }, | |
| { | |
| "entropy": 0.4522814452648163, | |
| "epoch": 28.65, | |
| "grad_norm": 7.506436347961426, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0743, | |
| "mean_token_accuracy": 0.96835857629776, | |
| "num_tokens": 150155231.0, | |
| "step": 573 | |
| }, | |
| { | |
| "entropy": 0.45204830169677734, | |
| "epoch": 28.7, | |
| "grad_norm": 3.9133265018463135, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0534, | |
| "mean_token_accuracy": 0.9761350154876709, | |
| "num_tokens": 150417278.0, | |
| "step": 574 | |
| }, | |
| { | |
| "entropy": 0.4525344967842102, | |
| "epoch": 28.75, | |
| "grad_norm": 4.4698967933654785, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0738, | |
| "mean_token_accuracy": 0.9701104760169983, | |
| "num_tokens": 150679278.0, | |
| "step": 575 | |
| }, | |
| { | |
| "entropy": 0.4537837505340576, | |
| "epoch": 28.8, | |
| "grad_norm": 8.1281156539917, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0671, | |
| "mean_token_accuracy": 0.9691211581230164, | |
| "num_tokens": 150941324.0, | |
| "step": 576 | |
| }, | |
| { | |
| "entropy": 0.449785053730011, | |
| "epoch": 28.85, | |
| "grad_norm": 6.921140670776367, | |
| "learning_rate": 1e-06, | |
| "loss": 0.048, | |
| "mean_token_accuracy": 0.9780303239822388, | |
| "num_tokens": 151203402.0, | |
| "step": 577 | |
| }, | |
| { | |
| "entropy": 0.45099085569381714, | |
| "epoch": 28.9, | |
| "grad_norm": 5.108087539672852, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0702, | |
| "mean_token_accuracy": 0.9698275923728943, | |
| "num_tokens": 151465454.0, | |
| "step": 578 | |
| }, | |
| { | |
| "entropy": 0.4535222053527832, | |
| "epoch": 28.95, | |
| "grad_norm": 7.629486560821533, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0768, | |
| "mean_token_accuracy": 0.9671140909194946, | |
| "num_tokens": 151727537.0, | |
| "step": 579 | |
| }, | |
| { | |
| "entropy": 0.4530973434448242, | |
| "epoch": 29.0, | |
| "grad_norm": 6.288832187652588, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0673, | |
| "mean_token_accuracy": 0.9699453711509705, | |
| "num_tokens": 151989594.0, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 29.0, | |
| "eval_entropy": 0.45486804842948914, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9005376100540161, | |
| "eval_num_tokens": 151989594.0, | |
| "eval_runtime": 0.5674, | |
| "eval_samples_per_second": 440.627, | |
| "eval_steps_per_second": 1.763, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 0.45344972610473633, | |
| "epoch": 29.05, | |
| "grad_norm": 3.3168556690216064, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0725, | |
| "mean_token_accuracy": 0.9668222069740295, | |
| "num_tokens": 152251621.0, | |
| "step": 581 | |
| }, | |
| { | |
| "entropy": 0.4522762894630432, | |
| "epoch": 29.1, | |
| "grad_norm": 5.237521171569824, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0749, | |
| "mean_token_accuracy": 0.973809540271759, | |
| "num_tokens": 152513705.0, | |
| "step": 582 | |
| }, | |
| { | |
| "entropy": 0.45483410358428955, | |
| "epoch": 29.15, | |
| "grad_norm": 5.303705215454102, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0697, | |
| "mean_token_accuracy": 0.9701678156852722, | |
| "num_tokens": 152775775.0, | |
| "step": 583 | |
| }, | |
| { | |
| "entropy": 0.4540916681289673, | |
| "epoch": 29.2, | |
| "grad_norm": 5.1602630615234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0507, | |
| "mean_token_accuracy": 0.9763142466545105, | |
| "num_tokens": 153037802.0, | |
| "step": 584 | |
| }, | |
| { | |
| "entropy": 0.4509674608707428, | |
| "epoch": 29.25, | |
| "grad_norm": 4.621006965637207, | |
| "learning_rate": 1e-06, | |
| "loss": 0.052, | |
| "mean_token_accuracy": 0.9785344004631042, | |
| "num_tokens": 153299878.0, | |
| "step": 585 | |
| }, | |
| { | |
| "entropy": 0.4527459144592285, | |
| "epoch": 29.3, | |
| "grad_norm": 5.821897506713867, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0538, | |
| "mean_token_accuracy": 0.9764150977134705, | |
| "num_tokens": 153561922.0, | |
| "step": 586 | |
| }, | |
| { | |
| "entropy": 0.4510110914707184, | |
| "epoch": 29.35, | |
| "grad_norm": 9.293543815612793, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0557, | |
| "mean_token_accuracy": 0.9750346541404724, | |
| "num_tokens": 153823957.0, | |
| "step": 587 | |
| }, | |
| { | |
| "entropy": 0.45252758264541626, | |
| "epoch": 29.4, | |
| "grad_norm": 5.370669364929199, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0759, | |
| "mean_token_accuracy": 0.9652448892593384, | |
| "num_tokens": 154086047.0, | |
| "step": 588 | |
| }, | |
| { | |
| "entropy": 0.45324066281318665, | |
| "epoch": 29.45, | |
| "grad_norm": 6.146717071533203, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0568, | |
| "mean_token_accuracy": 0.97549968957901, | |
| "num_tokens": 154348125.0, | |
| "step": 589 | |
| }, | |
| { | |
| "entropy": 0.45057687163352966, | |
| "epoch": 29.5, | |
| "grad_norm": 9.826089859008789, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0622, | |
| "mean_token_accuracy": 0.972470223903656, | |
| "num_tokens": 154610194.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 0.45304763317108154, | |
| "epoch": 29.55, | |
| "grad_norm": 6.804394721984863, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0695, | |
| "mean_token_accuracy": 0.9685990214347839, | |
| "num_tokens": 154872230.0, | |
| "step": 591 | |
| }, | |
| { | |
| "entropy": 0.4490482807159424, | |
| "epoch": 29.6, | |
| "grad_norm": 8.995187759399414, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0624, | |
| "mean_token_accuracy": 0.9748928546905518, | |
| "num_tokens": 155134258.0, | |
| "step": 592 | |
| }, | |
| { | |
| "entropy": 0.44886791706085205, | |
| "epoch": 29.65, | |
| "grad_norm": 10.23698616027832, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0823, | |
| "mean_token_accuracy": 0.9673491716384888, | |
| "num_tokens": 155396295.0, | |
| "step": 593 | |
| }, | |
| { | |
| "entropy": 0.45058315992355347, | |
| "epoch": 29.7, | |
| "grad_norm": 10.80184555053711, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0688, | |
| "mean_token_accuracy": 0.9761580228805542, | |
| "num_tokens": 155658323.0, | |
| "step": 594 | |
| }, | |
| { | |
| "entropy": 0.4487557113170624, | |
| "epoch": 29.75, | |
| "grad_norm": 7.765211582183838, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0508, | |
| "mean_token_accuracy": 0.9813148975372314, | |
| "num_tokens": 155920361.0, | |
| "step": 595 | |
| }, | |
| { | |
| "entropy": 0.44877922534942627, | |
| "epoch": 29.8, | |
| "grad_norm": 5.086024761199951, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0614, | |
| "mean_token_accuracy": 0.9721935987472534, | |
| "num_tokens": 156182401.0, | |
| "step": 596 | |
| }, | |
| { | |
| "entropy": 0.44867563247680664, | |
| "epoch": 29.85, | |
| "grad_norm": 5.775504112243652, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0674, | |
| "mean_token_accuracy": 0.9715116024017334, | |
| "num_tokens": 156444450.0, | |
| "step": 597 | |
| }, | |
| { | |
| "entropy": 0.4486447274684906, | |
| "epoch": 29.9, | |
| "grad_norm": 9.032340049743652, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0724, | |
| "mean_token_accuracy": 0.9696561098098755, | |
| "num_tokens": 156706493.0, | |
| "step": 598 | |
| }, | |
| { | |
| "entropy": 0.4501435458660126, | |
| "epoch": 29.95, | |
| "grad_norm": 7.640662670135498, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0827, | |
| "mean_token_accuracy": 0.9632047414779663, | |
| "num_tokens": 156968540.0, | |
| "step": 599 | |
| }, | |
| { | |
| "entropy": 0.4509955048561096, | |
| "epoch": 30.0, | |
| "grad_norm": 3.8913021087646484, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0674, | |
| "mean_token_accuracy": 0.9720497131347656, | |
| "num_tokens": 157230603.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "eval_entropy": 0.4519185423851013, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9072580933570862, | |
| "eval_num_tokens": 157230603.0, | |
| "eval_runtime": 0.5621, | |
| "eval_samples_per_second": 444.74, | |
| "eval_steps_per_second": 1.779, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 0.44891834259033203, | |
| "epoch": 30.05, | |
| "grad_norm": 3.6025571823120117, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0532, | |
| "mean_token_accuracy": 0.9758663177490234, | |
| "num_tokens": 157492680.0, | |
| "step": 601 | |
| }, | |
| { | |
| "entropy": 0.4488842785358429, | |
| "epoch": 30.1, | |
| "grad_norm": 4.095007419586182, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0485, | |
| "mean_token_accuracy": 0.9798251390457153, | |
| "num_tokens": 157754760.0, | |
| "step": 602 | |
| }, | |
| { | |
| "entropy": 0.44925498962402344, | |
| "epoch": 30.15, | |
| "grad_norm": 4.470601558685303, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0502, | |
| "mean_token_accuracy": 0.9817113280296326, | |
| "num_tokens": 158016818.0, | |
| "step": 603 | |
| }, | |
| { | |
| "entropy": 0.4520828127861023, | |
| "epoch": 30.2, | |
| "grad_norm": 6.587667465209961, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0553, | |
| "mean_token_accuracy": 0.9785202741622925, | |
| "num_tokens": 158278889.0, | |
| "step": 604 | |
| }, | |
| { | |
| "entropy": 0.4510212242603302, | |
| "epoch": 30.25, | |
| "grad_norm": 4.29756498336792, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0603, | |
| "mean_token_accuracy": 0.9725936055183411, | |
| "num_tokens": 158540879.0, | |
| "step": 605 | |
| }, | |
| { | |
| "entropy": 0.45233023166656494, | |
| "epoch": 30.3, | |
| "grad_norm": 5.920616149902344, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0639, | |
| "mean_token_accuracy": 0.9738602042198181, | |
| "num_tokens": 158802919.0, | |
| "step": 606 | |
| }, | |
| { | |
| "entropy": 0.44946354627609253, | |
| "epoch": 30.35, | |
| "grad_norm": 8.821480751037598, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0544, | |
| "mean_token_accuracy": 0.9739726185798645, | |
| "num_tokens": 159064939.0, | |
| "step": 607 | |
| }, | |
| { | |
| "entropy": 0.4503335952758789, | |
| "epoch": 30.4, | |
| "grad_norm": 5.011677265167236, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0653, | |
| "mean_token_accuracy": 0.9722222089767456, | |
| "num_tokens": 159326987.0, | |
| "step": 608 | |
| }, | |
| { | |
| "entropy": 0.4475363492965698, | |
| "epoch": 30.45, | |
| "grad_norm": 4.521021842956543, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0573, | |
| "mean_token_accuracy": 0.9772393703460693, | |
| "num_tokens": 159589074.0, | |
| "step": 609 | |
| }, | |
| { | |
| "entropy": 0.4487738609313965, | |
| "epoch": 30.5, | |
| "grad_norm": 4.860763072967529, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0667, | |
| "mean_token_accuracy": 0.9696969985961914, | |
| "num_tokens": 159851152.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 0.4483632445335388, | |
| "epoch": 30.55, | |
| "grad_norm": 7.282129287719727, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0473, | |
| "mean_token_accuracy": 0.9839181303977966, | |
| "num_tokens": 160113212.0, | |
| "step": 611 | |
| }, | |
| { | |
| "entropy": 0.4463149607181549, | |
| "epoch": 30.6, | |
| "grad_norm": 7.292006969451904, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0877, | |
| "mean_token_accuracy": 0.9573770761489868, | |
| "num_tokens": 160375297.0, | |
| "step": 612 | |
| }, | |
| { | |
| "entropy": 0.44664695858955383, | |
| "epoch": 30.65, | |
| "grad_norm": 6.004032135009766, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0619, | |
| "mean_token_accuracy": 0.9759036302566528, | |
| "num_tokens": 160637336.0, | |
| "step": 613 | |
| }, | |
| { | |
| "entropy": 0.4448843002319336, | |
| "epoch": 30.7, | |
| "grad_norm": 7.444438934326172, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0686, | |
| "mean_token_accuracy": 0.9725457429885864, | |
| "num_tokens": 160899395.0, | |
| "step": 614 | |
| }, | |
| { | |
| "entropy": 0.44413405656814575, | |
| "epoch": 30.75, | |
| "grad_norm": 7.9332804679870605, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0583, | |
| "mean_token_accuracy": 0.9747899174690247, | |
| "num_tokens": 161161469.0, | |
| "step": 615 | |
| }, | |
| { | |
| "entropy": 0.4475647211074829, | |
| "epoch": 30.8, | |
| "grad_norm": 7.166046142578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0595, | |
| "mean_token_accuracy": 0.9761489033699036, | |
| "num_tokens": 161423484.0, | |
| "step": 616 | |
| }, | |
| { | |
| "entropy": 0.44248896837234497, | |
| "epoch": 30.85, | |
| "grad_norm": 4.561134338378906, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0667, | |
| "mean_token_accuracy": 0.9668790102005005, | |
| "num_tokens": 161685515.0, | |
| "step": 617 | |
| }, | |
| { | |
| "entropy": 0.4430382549762726, | |
| "epoch": 30.9, | |
| "grad_norm": 12.0337495803833, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0723, | |
| "mean_token_accuracy": 0.9708737730979919, | |
| "num_tokens": 161947583.0, | |
| "step": 618 | |
| }, | |
| { | |
| "entropy": 0.44385209679603577, | |
| "epoch": 30.95, | |
| "grad_norm": 5.855442523956299, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0771, | |
| "mean_token_accuracy": 0.9631548523902893, | |
| "num_tokens": 162209583.0, | |
| "step": 619 | |
| }, | |
| { | |
| "entropy": 0.4453125596046448, | |
| "epoch": 31.0, | |
| "grad_norm": 9.10976791381836, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0687, | |
| "mean_token_accuracy": 0.96875, | |
| "num_tokens": 162471638.0, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 31.0, | |
| "eval_entropy": 0.44676893949508667, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9065860509872437, | |
| "eval_num_tokens": 162471638.0, | |
| "eval_runtime": 0.5669, | |
| "eval_samples_per_second": 440.995, | |
| "eval_steps_per_second": 1.764, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 0.4446421265602112, | |
| "epoch": 31.05, | |
| "grad_norm": 5.643488883972168, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0501, | |
| "mean_token_accuracy": 0.9764078855514526, | |
| "num_tokens": 162733710.0, | |
| "step": 621 | |
| }, | |
| { | |
| "entropy": 0.4460245370864868, | |
| "epoch": 31.1, | |
| "grad_norm": 7.093123912811279, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0643, | |
| "mean_token_accuracy": 0.971222996711731, | |
| "num_tokens": 162995785.0, | |
| "step": 622 | |
| }, | |
| { | |
| "entropy": 0.4435853958129883, | |
| "epoch": 31.15, | |
| "grad_norm": 7.843852519989014, | |
| "learning_rate": 1e-06, | |
| "loss": 0.06, | |
| "mean_token_accuracy": 0.9783491492271423, | |
| "num_tokens": 163257856.0, | |
| "step": 623 | |
| }, | |
| { | |
| "entropy": 0.44595205783843994, | |
| "epoch": 31.2, | |
| "grad_norm": 9.885993003845215, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0622, | |
| "mean_token_accuracy": 0.9701765179634094, | |
| "num_tokens": 163519927.0, | |
| "step": 624 | |
| }, | |
| { | |
| "entropy": 0.4446321129798889, | |
| "epoch": 31.25, | |
| "grad_norm": 9.069074630737305, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0682, | |
| "mean_token_accuracy": 0.9672130942344666, | |
| "num_tokens": 163781974.0, | |
| "step": 625 | |
| }, | |
| { | |
| "entropy": 0.44555115699768066, | |
| "epoch": 31.3, | |
| "grad_norm": 7.41979455947876, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0667, | |
| "mean_token_accuracy": 0.9729869961738586, | |
| "num_tokens": 164043997.0, | |
| "step": 626 | |
| }, | |
| { | |
| "entropy": 0.44311100244522095, | |
| "epoch": 31.35, | |
| "grad_norm": 6.325224876403809, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0643, | |
| "mean_token_accuracy": 0.9761525988578796, | |
| "num_tokens": 164306046.0, | |
| "step": 627 | |
| }, | |
| { | |
| "entropy": 0.44550782442092896, | |
| "epoch": 31.4, | |
| "grad_norm": 5.873880863189697, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0489, | |
| "mean_token_accuracy": 0.9810085296630859, | |
| "num_tokens": 164568133.0, | |
| "step": 628 | |
| }, | |
| { | |
| "entropy": 0.44575005769729614, | |
| "epoch": 31.45, | |
| "grad_norm": 6.150679111480713, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0692, | |
| "mean_token_accuracy": 0.9675675630569458, | |
| "num_tokens": 164830154.0, | |
| "step": 629 | |
| }, | |
| { | |
| "entropy": 0.44503775238990784, | |
| "epoch": 31.5, | |
| "grad_norm": 6.830018520355225, | |
| "learning_rate": 1e-06, | |
| "loss": 0.064, | |
| "mean_token_accuracy": 0.9699599742889404, | |
| "num_tokens": 165092212.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 0.44527047872543335, | |
| "epoch": 31.55, | |
| "grad_norm": 8.90091323852539, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0623, | |
| "mean_token_accuracy": 0.9720279574394226, | |
| "num_tokens": 165354268.0, | |
| "step": 631 | |
| }, | |
| { | |
| "entropy": 0.4455764591693878, | |
| "epoch": 31.6, | |
| "grad_norm": 10.846599578857422, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0505, | |
| "mean_token_accuracy": 0.9804713726043701, | |
| "num_tokens": 165616280.0, | |
| "step": 632 | |
| }, | |
| { | |
| "entropy": 0.44468122720718384, | |
| "epoch": 31.65, | |
| "grad_norm": 6.496395111083984, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0701, | |
| "mean_token_accuracy": 0.9713010191917419, | |
| "num_tokens": 165878342.0, | |
| "step": 633 | |
| }, | |
| { | |
| "entropy": 0.4464184045791626, | |
| "epoch": 31.7, | |
| "grad_norm": 4.161031723022461, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0486, | |
| "mean_token_accuracy": 0.9813348650932312, | |
| "num_tokens": 166140340.0, | |
| "step": 634 | |
| }, | |
| { | |
| "entropy": 0.4466201961040497, | |
| "epoch": 31.75, | |
| "grad_norm": 4.4769287109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0646, | |
| "mean_token_accuracy": 0.9720670580863953, | |
| "num_tokens": 166402374.0, | |
| "step": 635 | |
| }, | |
| { | |
| "entropy": 0.4480331540107727, | |
| "epoch": 31.8, | |
| "grad_norm": 5.939889430999756, | |
| "learning_rate": 1e-06, | |
| "loss": 0.064, | |
| "mean_token_accuracy": 0.9711799621582031, | |
| "num_tokens": 166664443.0, | |
| "step": 636 | |
| }, | |
| { | |
| "entropy": 0.4440794587135315, | |
| "epoch": 31.85, | |
| "grad_norm": 8.899001121520996, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0615, | |
| "mean_token_accuracy": 0.9747899174690247, | |
| "num_tokens": 166926510.0, | |
| "step": 637 | |
| }, | |
| { | |
| "entropy": 0.4426196217536926, | |
| "epoch": 31.9, | |
| "grad_norm": 7.370424747467041, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0638, | |
| "mean_token_accuracy": 0.9732052683830261, | |
| "num_tokens": 167188553.0, | |
| "step": 638 | |
| }, | |
| { | |
| "entropy": 0.4424576759338379, | |
| "epoch": 31.95, | |
| "grad_norm": 5.7402801513671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0609, | |
| "mean_token_accuracy": 0.9725839495658875, | |
| "num_tokens": 167450605.0, | |
| "step": 639 | |
| }, | |
| { | |
| "entropy": 0.4405357241630554, | |
| "epoch": 32.0, | |
| "grad_norm": 6.917886734008789, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0561, | |
| "mean_token_accuracy": 0.9764705896377563, | |
| "num_tokens": 167712685.0, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 32.0, | |
| "eval_entropy": 0.44569727778434753, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9086021780967712, | |
| "eval_num_tokens": 167712685.0, | |
| "eval_runtime": 0.5657, | |
| "eval_samples_per_second": 441.926, | |
| "eval_steps_per_second": 1.768, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 0.44468456506729126, | |
| "epoch": 32.05, | |
| "grad_norm": 3.7619173526763916, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0504, | |
| "mean_token_accuracy": 0.9797172546386719, | |
| "num_tokens": 167974740.0, | |
| "step": 641 | |
| }, | |
| { | |
| "entropy": 0.44157981872558594, | |
| "epoch": 32.1, | |
| "grad_norm": 6.412563323974609, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0458, | |
| "mean_token_accuracy": 0.9774086475372314, | |
| "num_tokens": 168236805.0, | |
| "step": 642 | |
| }, | |
| { | |
| "entropy": 0.44227516651153564, | |
| "epoch": 32.15, | |
| "grad_norm": 7.628910064697266, | |
| "learning_rate": 1e-06, | |
| "loss": 0.065, | |
| "mean_token_accuracy": 0.966926097869873, | |
| "num_tokens": 168498874.0, | |
| "step": 643 | |
| }, | |
| { | |
| "entropy": 0.4421759843826294, | |
| "epoch": 32.2, | |
| "grad_norm": 4.509586811065674, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0453, | |
| "mean_token_accuracy": 0.9816176295280457, | |
| "num_tokens": 168760901.0, | |
| "step": 644 | |
| }, | |
| { | |
| "entropy": 0.44034460186958313, | |
| "epoch": 32.25, | |
| "grad_norm": 4.948095798492432, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0462, | |
| "mean_token_accuracy": 0.9810659289360046, | |
| "num_tokens": 169022986.0, | |
| "step": 645 | |
| }, | |
| { | |
| "entropy": 0.43732625246047974, | |
| "epoch": 32.3, | |
| "grad_norm": 6.055516719818115, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0719, | |
| "mean_token_accuracy": 0.969675600528717, | |
| "num_tokens": 169285063.0, | |
| "step": 646 | |
| }, | |
| { | |
| "entropy": 0.44073206186294556, | |
| "epoch": 32.35, | |
| "grad_norm": 6.052289962768555, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0546, | |
| "mean_token_accuracy": 0.979825496673584, | |
| "num_tokens": 169547127.0, | |
| "step": 647 | |
| }, | |
| { | |
| "entropy": 0.43953657150268555, | |
| "epoch": 32.4, | |
| "grad_norm": 5.309092044830322, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0562, | |
| "mean_token_accuracy": 0.977673351764679, | |
| "num_tokens": 169809191.0, | |
| "step": 648 | |
| }, | |
| { | |
| "entropy": 0.4400402307510376, | |
| "epoch": 32.45, | |
| "grad_norm": 3.462468385696411, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0536, | |
| "mean_token_accuracy": 0.9798488616943359, | |
| "num_tokens": 170071240.0, | |
| "step": 649 | |
| }, | |
| { | |
| "entropy": 0.4371636211872101, | |
| "epoch": 32.5, | |
| "grad_norm": 9.595351219177246, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0584, | |
| "mean_token_accuracy": 0.9729729890823364, | |
| "num_tokens": 170333259.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 0.43737897276878357, | |
| "epoch": 32.55, | |
| "grad_norm": 6.217532634735107, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0549, | |
| "mean_token_accuracy": 0.9765312075614929, | |
| "num_tokens": 170595335.0, | |
| "step": 651 | |
| }, | |
| { | |
| "entropy": 0.4356423318386078, | |
| "epoch": 32.6, | |
| "grad_norm": 6.676355361938477, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0569, | |
| "mean_token_accuracy": 0.9793530702590942, | |
| "num_tokens": 170857414.0, | |
| "step": 652 | |
| }, | |
| { | |
| "entropy": 0.43704909086227417, | |
| "epoch": 32.65, | |
| "grad_norm": 5.60529899597168, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0594, | |
| "mean_token_accuracy": 0.9732779264450073, | |
| "num_tokens": 171119460.0, | |
| "step": 653 | |
| }, | |
| { | |
| "entropy": 0.43586838245391846, | |
| "epoch": 32.7, | |
| "grad_norm": 9.050461769104004, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0717, | |
| "mean_token_accuracy": 0.9660377502441406, | |
| "num_tokens": 171381511.0, | |
| "step": 654 | |
| }, | |
| { | |
| "entropy": 0.43764546513557434, | |
| "epoch": 32.75, | |
| "grad_norm": 7.719151020050049, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0519, | |
| "mean_token_accuracy": 0.9790164232254028, | |
| "num_tokens": 171643563.0, | |
| "step": 655 | |
| }, | |
| { | |
| "entropy": 0.4377431273460388, | |
| "epoch": 32.8, | |
| "grad_norm": 6.429050922393799, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0613, | |
| "mean_token_accuracy": 0.9767326712608337, | |
| "num_tokens": 171905582.0, | |
| "step": 656 | |
| }, | |
| { | |
| "entropy": 0.4376789927482605, | |
| "epoch": 32.85, | |
| "grad_norm": 7.1417236328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0622, | |
| "mean_token_accuracy": 0.970708429813385, | |
| "num_tokens": 172167610.0, | |
| "step": 657 | |
| }, | |
| { | |
| "entropy": 0.4353598952293396, | |
| "epoch": 32.9, | |
| "grad_norm": 6.490878105163574, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0577, | |
| "mean_token_accuracy": 0.9740012884140015, | |
| "num_tokens": 172429615.0, | |
| "step": 658 | |
| }, | |
| { | |
| "entropy": 0.43777158856391907, | |
| "epoch": 32.95, | |
| "grad_norm": 8.160463333129883, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0696, | |
| "mean_token_accuracy": 0.9702098965644836, | |
| "num_tokens": 172691685.0, | |
| "step": 659 | |
| }, | |
| { | |
| "entropy": 0.43828877806663513, | |
| "epoch": 33.0, | |
| "grad_norm": 7.0130391120910645, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0501, | |
| "mean_token_accuracy": 0.9810486435890198, | |
| "num_tokens": 172953696.0, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 33.0, | |
| "eval_entropy": 0.4405567944049835, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9025537371635437, | |
| "eval_num_tokens": 172953696.0, | |
| "eval_runtime": 0.5665, | |
| "eval_samples_per_second": 441.287, | |
| "eval_steps_per_second": 1.765, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 0.4386732578277588, | |
| "epoch": 33.05, | |
| "grad_norm": 4.480530261993408, | |
| "learning_rate": 1e-06, | |
| "loss": 0.062, | |
| "mean_token_accuracy": 0.9707673788070679, | |
| "num_tokens": 173215733.0, | |
| "step": 661 | |
| }, | |
| { | |
| "entropy": 0.4378315806388855, | |
| "epoch": 33.1, | |
| "grad_norm": 8.64880657196045, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0588, | |
| "mean_token_accuracy": 0.9721043109893799, | |
| "num_tokens": 173477810.0, | |
| "step": 662 | |
| }, | |
| { | |
| "entropy": 0.438620388507843, | |
| "epoch": 33.15, | |
| "grad_norm": 6.820461750030518, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0623, | |
| "mean_token_accuracy": 0.973120927810669, | |
| "num_tokens": 173739851.0, | |
| "step": 663 | |
| }, | |
| { | |
| "entropy": 0.4371694326400757, | |
| "epoch": 33.2, | |
| "grad_norm": 5.388397693634033, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0438, | |
| "mean_token_accuracy": 0.9832369685173035, | |
| "num_tokens": 174001910.0, | |
| "step": 664 | |
| }, | |
| { | |
| "entropy": 0.4353873133659363, | |
| "epoch": 33.25, | |
| "grad_norm": 5.447931289672852, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0507, | |
| "mean_token_accuracy": 0.9806560277938843, | |
| "num_tokens": 174263956.0, | |
| "step": 665 | |
| }, | |
| { | |
| "entropy": 0.43813008069992065, | |
| "epoch": 33.3, | |
| "grad_norm": 7.453225612640381, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0583, | |
| "mean_token_accuracy": 0.9732201099395752, | |
| "num_tokens": 174525981.0, | |
| "step": 666 | |
| }, | |
| { | |
| "entropy": 0.4330652356147766, | |
| "epoch": 33.35, | |
| "grad_norm": 4.948639869689941, | |
| "learning_rate": 1e-06, | |
| "loss": 0.055, | |
| "mean_token_accuracy": 0.9784615635871887, | |
| "num_tokens": 174788067.0, | |
| "step": 667 | |
| }, | |
| { | |
| "entropy": 0.43555963039398193, | |
| "epoch": 33.4, | |
| "grad_norm": 9.127644538879395, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0534, | |
| "mean_token_accuracy": 0.9801324605941772, | |
| "num_tokens": 175050118.0, | |
| "step": 668 | |
| }, | |
| { | |
| "entropy": 0.4352026581764221, | |
| "epoch": 33.45, | |
| "grad_norm": 10.331482887268066, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0666, | |
| "mean_token_accuracy": 0.9726110696792603, | |
| "num_tokens": 175312189.0, | |
| "step": 669 | |
| }, | |
| { | |
| "entropy": 0.434938907623291, | |
| "epoch": 33.5, | |
| "grad_norm": 7.918214797973633, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0586, | |
| "mean_token_accuracy": 0.9758656620979309, | |
| "num_tokens": 175574259.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 0.43676304817199707, | |
| "epoch": 33.55, | |
| "grad_norm": 7.050904750823975, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0636, | |
| "mean_token_accuracy": 0.9727171659469604, | |
| "num_tokens": 175836318.0, | |
| "step": 671 | |
| }, | |
| { | |
| "entropy": 0.4355317950248718, | |
| "epoch": 33.6, | |
| "grad_norm": 3.8058547973632812, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0469, | |
| "mean_token_accuracy": 0.9824120402336121, | |
| "num_tokens": 176098338.0, | |
| "step": 672 | |
| }, | |
| { | |
| "entropy": 0.437080442905426, | |
| "epoch": 33.65, | |
| "grad_norm": 4.726657390594482, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0504, | |
| "mean_token_accuracy": 0.981028139591217, | |
| "num_tokens": 176360400.0, | |
| "step": 673 | |
| }, | |
| { | |
| "entropy": 0.4355364441871643, | |
| "epoch": 33.7, | |
| "grad_norm": 4.205655097961426, | |
| "learning_rate": 1e-06, | |
| "loss": 0.064, | |
| "mean_token_accuracy": 0.9719271659851074, | |
| "num_tokens": 176622443.0, | |
| "step": 674 | |
| }, | |
| { | |
| "entropy": 0.43918710947036743, | |
| "epoch": 33.75, | |
| "grad_norm": 8.278221130371094, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0544, | |
| "mean_token_accuracy": 0.9779179692268372, | |
| "num_tokens": 176884456.0, | |
| "step": 675 | |
| }, | |
| { | |
| "entropy": 0.43781429529190063, | |
| "epoch": 33.8, | |
| "grad_norm": 10.086792945861816, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0579, | |
| "mean_token_accuracy": 0.9812734127044678, | |
| "num_tokens": 177146486.0, | |
| "step": 676 | |
| }, | |
| { | |
| "entropy": 0.4368705749511719, | |
| "epoch": 33.85, | |
| "grad_norm": 7.0400567054748535, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0574, | |
| "mean_token_accuracy": 0.973964512348175, | |
| "num_tokens": 177408538.0, | |
| "step": 677 | |
| }, | |
| { | |
| "entropy": 0.4386277198791504, | |
| "epoch": 33.9, | |
| "grad_norm": 5.393922805786133, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0525, | |
| "mean_token_accuracy": 0.9760109782218933, | |
| "num_tokens": 177670590.0, | |
| "step": 678 | |
| }, | |
| { | |
| "entropy": 0.4350482225418091, | |
| "epoch": 33.95, | |
| "grad_norm": 8.861899375915527, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0541, | |
| "mean_token_accuracy": 0.9747545719146729, | |
| "num_tokens": 177932675.0, | |
| "step": 679 | |
| }, | |
| { | |
| "entropy": 0.43752387166023254, | |
| "epoch": 34.0, | |
| "grad_norm": 8.595000267028809, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0608, | |
| "mean_token_accuracy": 0.9761388301849365, | |
| "num_tokens": 178194717.0, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 34.0, | |
| "eval_entropy": 0.4389887750148773, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9045698642730713, | |
| "eval_num_tokens": 178194717.0, | |
| "eval_runtime": 0.565, | |
| "eval_samples_per_second": 442.441, | |
| "eval_steps_per_second": 1.77, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 0.4371822476387024, | |
| "epoch": 34.05, | |
| "grad_norm": 5.267615795135498, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0442, | |
| "mean_token_accuracy": 0.9826968908309937, | |
| "num_tokens": 178456788.0, | |
| "step": 681 | |
| }, | |
| { | |
| "entropy": 0.4385065734386444, | |
| "epoch": 34.1, | |
| "grad_norm": 6.0776567459106445, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0497, | |
| "mean_token_accuracy": 0.9802631735801697, | |
| "num_tokens": 178718822.0, | |
| "step": 682 | |
| }, | |
| { | |
| "entropy": 0.43671929836273193, | |
| "epoch": 34.15, | |
| "grad_norm": 6.3837175369262695, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0593, | |
| "mean_token_accuracy": 0.9744361042976379, | |
| "num_tokens": 178980877.0, | |
| "step": 683 | |
| }, | |
| { | |
| "entropy": 0.43611472845077515, | |
| "epoch": 34.2, | |
| "grad_norm": 8.407622337341309, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0491, | |
| "mean_token_accuracy": 0.9801462888717651, | |
| "num_tokens": 179242922.0, | |
| "step": 684 | |
| }, | |
| { | |
| "entropy": 0.4359371066093445, | |
| "epoch": 34.25, | |
| "grad_norm": 5.882180213928223, | |
| "learning_rate": 1e-06, | |
| "loss": 0.074, | |
| "mean_token_accuracy": 0.9692609310150146, | |
| "num_tokens": 179504978.0, | |
| "step": 685 | |
| }, | |
| { | |
| "entropy": 0.4321635961532593, | |
| "epoch": 34.3, | |
| "grad_norm": 9.319916725158691, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0675, | |
| "mean_token_accuracy": 0.9795918464660645, | |
| "num_tokens": 179767047.0, | |
| "step": 686 | |
| }, | |
| { | |
| "entropy": 0.4343339502811432, | |
| "epoch": 34.35, | |
| "grad_norm": 6.66765832901001, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0455, | |
| "mean_token_accuracy": 0.9825620651245117, | |
| "num_tokens": 180029098.0, | |
| "step": 687 | |
| }, | |
| { | |
| "entropy": 0.43469831347465515, | |
| "epoch": 34.4, | |
| "grad_norm": 6.452228546142578, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0577, | |
| "mean_token_accuracy": 0.9775280952453613, | |
| "num_tokens": 180291151.0, | |
| "step": 688 | |
| }, | |
| { | |
| "entropy": 0.43252041935920715, | |
| "epoch": 34.45, | |
| "grad_norm": 4.9461822509765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0601, | |
| "mean_token_accuracy": 0.9745628237724304, | |
| "num_tokens": 180553169.0, | |
| "step": 689 | |
| }, | |
| { | |
| "entropy": 0.43243032693862915, | |
| "epoch": 34.5, | |
| "grad_norm": 13.353845596313477, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0657, | |
| "mean_token_accuracy": 0.9685197472572327, | |
| "num_tokens": 180815222.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 0.4328831434249878, | |
| "epoch": 34.55, | |
| "grad_norm": 10.329776763916016, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0614, | |
| "mean_token_accuracy": 0.9704106450080872, | |
| "num_tokens": 181077273.0, | |
| "step": 691 | |
| }, | |
| { | |
| "entropy": 0.43350595235824585, | |
| "epoch": 34.6, | |
| "grad_norm": 8.362601280212402, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0581, | |
| "mean_token_accuracy": 0.9745671153068542, | |
| "num_tokens": 181339285.0, | |
| "step": 692 | |
| }, | |
| { | |
| "entropy": 0.43465209007263184, | |
| "epoch": 34.65, | |
| "grad_norm": 4.411050796508789, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0474, | |
| "mean_token_accuracy": 0.980923056602478, | |
| "num_tokens": 181601338.0, | |
| "step": 693 | |
| }, | |
| { | |
| "entropy": 0.4354056715965271, | |
| "epoch": 34.7, | |
| "grad_norm": 4.7629008293151855, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0485, | |
| "mean_token_accuracy": 0.980560302734375, | |
| "num_tokens": 181863383.0, | |
| "step": 694 | |
| }, | |
| { | |
| "entropy": 0.43551138043403625, | |
| "epoch": 34.75, | |
| "grad_norm": 7.455565929412842, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0518, | |
| "mean_token_accuracy": 0.9773609042167664, | |
| "num_tokens": 182125456.0, | |
| "step": 695 | |
| }, | |
| { | |
| "entropy": 0.4353017807006836, | |
| "epoch": 34.8, | |
| "grad_norm": 4.370556354522705, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0517, | |
| "mean_token_accuracy": 0.9772007465362549, | |
| "num_tokens": 182387562.0, | |
| "step": 696 | |
| }, | |
| { | |
| "entropy": 0.4363037943840027, | |
| "epoch": 34.85, | |
| "grad_norm": 5.329375267028809, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0498, | |
| "mean_token_accuracy": 0.978723406791687, | |
| "num_tokens": 182649622.0, | |
| "step": 697 | |
| }, | |
| { | |
| "entropy": 0.43782755732536316, | |
| "epoch": 34.9, | |
| "grad_norm": 8.173315048217773, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0813, | |
| "mean_token_accuracy": 0.9646511673927307, | |
| "num_tokens": 182911639.0, | |
| "step": 698 | |
| }, | |
| { | |
| "entropy": 0.434041827917099, | |
| "epoch": 34.95, | |
| "grad_norm": 7.220884323120117, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0546, | |
| "mean_token_accuracy": 0.9753968119621277, | |
| "num_tokens": 183173690.0, | |
| "step": 699 | |
| }, | |
| { | |
| "entropy": 0.434684693813324, | |
| "epoch": 35.0, | |
| "grad_norm": 5.7697296142578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0538, | |
| "mean_token_accuracy": 0.9744229316711426, | |
| "num_tokens": 183435754.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 35.0, | |
| "eval_entropy": 0.437461256980896, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.8998655676841736, | |
| "eval_num_tokens": 183435754.0, | |
| "eval_runtime": 0.5651, | |
| "eval_samples_per_second": 442.394, | |
| "eval_steps_per_second": 1.77, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 0.435441255569458, | |
| "epoch": 35.05, | |
| "grad_norm": 4.88384485244751, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0589, | |
| "mean_token_accuracy": 0.9778853058815002, | |
| "num_tokens": 183697794.0, | |
| "step": 701 | |
| }, | |
| { | |
| "entropy": 0.4339606463909149, | |
| "epoch": 35.1, | |
| "grad_norm": 4.933961391448975, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0552, | |
| "mean_token_accuracy": 0.9752303957939148, | |
| "num_tokens": 183959859.0, | |
| "step": 702 | |
| }, | |
| { | |
| "entropy": 0.4330012798309326, | |
| "epoch": 35.15, | |
| "grad_norm": 7.595359802246094, | |
| "learning_rate": 1e-06, | |
| "loss": 0.055, | |
| "mean_token_accuracy": 0.9751279950141907, | |
| "num_tokens": 184221885.0, | |
| "step": 703 | |
| }, | |
| { | |
| "entropy": 0.4331626296043396, | |
| "epoch": 35.2, | |
| "grad_norm": 3.9097628593444824, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0441, | |
| "mean_token_accuracy": 0.983565092086792, | |
| "num_tokens": 184483961.0, | |
| "step": 704 | |
| }, | |
| { | |
| "entropy": 0.43158963322639465, | |
| "epoch": 35.25, | |
| "grad_norm": 5.426919937133789, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0534, | |
| "mean_token_accuracy": 0.977968156337738, | |
| "num_tokens": 184745990.0, | |
| "step": 705 | |
| }, | |
| { | |
| "entropy": 0.4347250759601593, | |
| "epoch": 35.3, | |
| "grad_norm": 7.520493030548096, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0506, | |
| "mean_token_accuracy": 0.9777158498764038, | |
| "num_tokens": 185008048.0, | |
| "step": 706 | |
| }, | |
| { | |
| "entropy": 0.4325712323188782, | |
| "epoch": 35.35, | |
| "grad_norm": 6.389200210571289, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0495, | |
| "mean_token_accuracy": 0.9767596125602722, | |
| "num_tokens": 185270114.0, | |
| "step": 707 | |
| }, | |
| { | |
| "entropy": 0.43263205885887146, | |
| "epoch": 35.4, | |
| "grad_norm": 4.24953031539917, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0524, | |
| "mean_token_accuracy": 0.9763349294662476, | |
| "num_tokens": 185532190.0, | |
| "step": 708 | |
| }, | |
| { | |
| "entropy": 0.4333184063434601, | |
| "epoch": 35.45, | |
| "grad_norm": 3.9612417221069336, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0518, | |
| "mean_token_accuracy": 0.9805447459220886, | |
| "num_tokens": 185794266.0, | |
| "step": 709 | |
| }, | |
| { | |
| "entropy": 0.43302151560783386, | |
| "epoch": 35.5, | |
| "grad_norm": 9.319104194641113, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0529, | |
| "mean_token_accuracy": 0.9785074591636658, | |
| "num_tokens": 186056270.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 0.4321300983428955, | |
| "epoch": 35.55, | |
| "grad_norm": 8.200410842895508, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0483, | |
| "mean_token_accuracy": 0.9809750318527222, | |
| "num_tokens": 186318314.0, | |
| "step": 711 | |
| }, | |
| { | |
| "entropy": 0.43272072076797485, | |
| "epoch": 35.6, | |
| "grad_norm": 3.841181516647339, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0489, | |
| "mean_token_accuracy": 0.977331280708313, | |
| "num_tokens": 186580353.0, | |
| "step": 712 | |
| }, | |
| { | |
| "entropy": 0.4306867718696594, | |
| "epoch": 35.65, | |
| "grad_norm": 4.986248016357422, | |
| "learning_rate": 1e-06, | |
| "loss": 0.061, | |
| "mean_token_accuracy": 0.9757112860679626, | |
| "num_tokens": 186842420.0, | |
| "step": 713 | |
| }, | |
| { | |
| "entropy": 0.43096476793289185, | |
| "epoch": 35.7, | |
| "grad_norm": 6.300476551055908, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0622, | |
| "mean_token_accuracy": 0.9771689772605896, | |
| "num_tokens": 187104492.0, | |
| "step": 714 | |
| }, | |
| { | |
| "entropy": 0.43027129769325256, | |
| "epoch": 35.75, | |
| "grad_norm": 8.516679763793945, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0592, | |
| "mean_token_accuracy": 0.9696202278137207, | |
| "num_tokens": 187366533.0, | |
| "step": 715 | |
| }, | |
| { | |
| "entropy": 0.4307771325111389, | |
| "epoch": 35.8, | |
| "grad_norm": 5.331250190734863, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0524, | |
| "mean_token_accuracy": 0.9772095680236816, | |
| "num_tokens": 187628562.0, | |
| "step": 716 | |
| }, | |
| { | |
| "entropy": 0.43008124828338623, | |
| "epoch": 35.85, | |
| "grad_norm": 5.206639289855957, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0589, | |
| "mean_token_accuracy": 0.9751424193382263, | |
| "num_tokens": 187890591.0, | |
| "step": 717 | |
| }, | |
| { | |
| "entropy": 0.43140071630477905, | |
| "epoch": 35.9, | |
| "grad_norm": 6.906051158905029, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0564, | |
| "mean_token_accuracy": 0.9787104725837708, | |
| "num_tokens": 188152663.0, | |
| "step": 718 | |
| }, | |
| { | |
| "entropy": 0.4307631850242615, | |
| "epoch": 35.95, | |
| "grad_norm": 10.007140159606934, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0484, | |
| "mean_token_accuracy": 0.9796776175498962, | |
| "num_tokens": 188414716.0, | |
| "step": 719 | |
| }, | |
| { | |
| "entropy": 0.4294223189353943, | |
| "epoch": 36.0, | |
| "grad_norm": 4.129380226135254, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0459, | |
| "mean_token_accuracy": 0.980211079120636, | |
| "num_tokens": 188676759.0, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 36.0, | |
| "eval_entropy": 0.4343888759613037, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9025537371635437, | |
| "eval_num_tokens": 188676759.0, | |
| "eval_runtime": 0.5635, | |
| "eval_samples_per_second": 443.648, | |
| "eval_steps_per_second": 1.775, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 0.4333025813102722, | |
| "epoch": 36.05, | |
| "grad_norm": 7.116022109985352, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0524, | |
| "mean_token_accuracy": 0.9802095293998718, | |
| "num_tokens": 188938806.0, | |
| "step": 721 | |
| }, | |
| { | |
| "entropy": 0.4314154386520386, | |
| "epoch": 36.1, | |
| "grad_norm": 8.479453086853027, | |
| "learning_rate": 1e-06, | |
| "loss": 0.069, | |
| "mean_token_accuracy": 0.9697193503379822, | |
| "num_tokens": 189200885.0, | |
| "step": 722 | |
| }, | |
| { | |
| "entropy": 0.4306054711341858, | |
| "epoch": 36.15, | |
| "grad_norm": 5.400243282318115, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0532, | |
| "mean_token_accuracy": 0.9769552946090698, | |
| "num_tokens": 189462943.0, | |
| "step": 723 | |
| }, | |
| { | |
| "entropy": 0.4325631856918335, | |
| "epoch": 36.2, | |
| "grad_norm": 4.690549850463867, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0597, | |
| "mean_token_accuracy": 0.9788106679916382, | |
| "num_tokens": 189724999.0, | |
| "step": 724 | |
| }, | |
| { | |
| "entropy": 0.43309372663497925, | |
| "epoch": 36.25, | |
| "grad_norm": 5.27554988861084, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0461, | |
| "mean_token_accuracy": 0.9805699586868286, | |
| "num_tokens": 189987037.0, | |
| "step": 725 | |
| }, | |
| { | |
| "entropy": 0.43083733320236206, | |
| "epoch": 36.3, | |
| "grad_norm": 6.90108060836792, | |
| "learning_rate": 1e-06, | |
| "loss": 0.046, | |
| "mean_token_accuracy": 0.980445384979248, | |
| "num_tokens": 190249042.0, | |
| "step": 726 | |
| }, | |
| { | |
| "entropy": 0.43399274349212646, | |
| "epoch": 36.35, | |
| "grad_norm": 6.533586502075195, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0521, | |
| "mean_token_accuracy": 0.9788219928741455, | |
| "num_tokens": 190511146.0, | |
| "step": 727 | |
| }, | |
| { | |
| "entropy": 0.43022042512893677, | |
| "epoch": 36.4, | |
| "grad_norm": 8.063142776489258, | |
| "learning_rate": 1e-06, | |
| "loss": 0.053, | |
| "mean_token_accuracy": 0.9760554432868958, | |
| "num_tokens": 190773227.0, | |
| "step": 728 | |
| }, | |
| { | |
| "entropy": 0.4320385456085205, | |
| "epoch": 36.45, | |
| "grad_norm": 5.83900260925293, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0455, | |
| "mean_token_accuracy": 0.9809523820877075, | |
| "num_tokens": 191035290.0, | |
| "step": 729 | |
| }, | |
| { | |
| "entropy": 0.4300408363342285, | |
| "epoch": 36.5, | |
| "grad_norm": 5.8197808265686035, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0668, | |
| "mean_token_accuracy": 0.9678930044174194, | |
| "num_tokens": 191297345.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 0.4326372444629669, | |
| "epoch": 36.55, | |
| "grad_norm": 4.969232082366943, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0426, | |
| "mean_token_accuracy": 0.9821896553039551, | |
| "num_tokens": 191559385.0, | |
| "step": 731 | |
| }, | |
| { | |
| "entropy": 0.4319472312927246, | |
| "epoch": 36.6, | |
| "grad_norm": 5.88218355178833, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0403, | |
| "mean_token_accuracy": 0.9824660420417786, | |
| "num_tokens": 191821416.0, | |
| "step": 732 | |
| }, | |
| { | |
| "entropy": 0.4315246343612671, | |
| "epoch": 36.65, | |
| "grad_norm": 3.370635747909546, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0444, | |
| "mean_token_accuracy": 0.983902096748352, | |
| "num_tokens": 192083430.0, | |
| "step": 733 | |
| }, | |
| { | |
| "entropy": 0.43474385142326355, | |
| "epoch": 36.7, | |
| "grad_norm": 3.9786579608917236, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0453, | |
| "mean_token_accuracy": 0.9828022122383118, | |
| "num_tokens": 192345472.0, | |
| "step": 734 | |
| }, | |
| { | |
| "entropy": 0.4301164150238037, | |
| "epoch": 36.75, | |
| "grad_norm": 5.871670722961426, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0595, | |
| "mean_token_accuracy": 0.9774436354637146, | |
| "num_tokens": 192607528.0, | |
| "step": 735 | |
| }, | |
| { | |
| "entropy": 0.4330886900424957, | |
| "epoch": 36.8, | |
| "grad_norm": 3.582524061203003, | |
| "learning_rate": 1e-06, | |
| "loss": 0.048, | |
| "mean_token_accuracy": 0.9793762564659119, | |
| "num_tokens": 192869581.0, | |
| "step": 736 | |
| }, | |
| { | |
| "entropy": 0.4340907037258148, | |
| "epoch": 36.85, | |
| "grad_norm": 8.269258499145508, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0491, | |
| "mean_token_accuracy": 0.9776817560195923, | |
| "num_tokens": 193131662.0, | |
| "step": 737 | |
| }, | |
| { | |
| "entropy": 0.43378227949142456, | |
| "epoch": 36.9, | |
| "grad_norm": 7.181451797485352, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0462, | |
| "mean_token_accuracy": 0.980169951915741, | |
| "num_tokens": 193393700.0, | |
| "step": 738 | |
| }, | |
| { | |
| "entropy": 0.4317992925643921, | |
| "epoch": 36.95, | |
| "grad_norm": 4.083510398864746, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0549, | |
| "mean_token_accuracy": 0.9743944406509399, | |
| "num_tokens": 193655738.0, | |
| "step": 739 | |
| }, | |
| { | |
| "entropy": 0.4305614233016968, | |
| "epoch": 37.0, | |
| "grad_norm": 5.232417106628418, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0575, | |
| "mean_token_accuracy": 0.9750133156776428, | |
| "num_tokens": 193917750.0, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 37.0, | |
| "eval_entropy": 0.43406665325164795, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9052419066429138, | |
| "eval_num_tokens": 193917750.0, | |
| "eval_runtime": 0.5662, | |
| "eval_samples_per_second": 441.571, | |
| "eval_steps_per_second": 1.766, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 0.4319198727607727, | |
| "epoch": 37.05, | |
| "grad_norm": 3.9398393630981445, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0402, | |
| "mean_token_accuracy": 0.9842470288276672, | |
| "num_tokens": 194179798.0, | |
| "step": 741 | |
| }, | |
| { | |
| "entropy": 0.4315286874771118, | |
| "epoch": 37.1, | |
| "grad_norm": 6.6097025871276855, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0466, | |
| "mean_token_accuracy": 0.9826037883758545, | |
| "num_tokens": 194441843.0, | |
| "step": 742 | |
| }, | |
| { | |
| "entropy": 0.4326039254665375, | |
| "epoch": 37.15, | |
| "grad_norm": 4.204853057861328, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0429, | |
| "mean_token_accuracy": 0.9815917611122131, | |
| "num_tokens": 194703887.0, | |
| "step": 743 | |
| }, | |
| { | |
| "entropy": 0.4300069510936737, | |
| "epoch": 37.2, | |
| "grad_norm": 3.7171454429626465, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0437, | |
| "mean_token_accuracy": 0.9782465100288391, | |
| "num_tokens": 194965964.0, | |
| "step": 744 | |
| }, | |
| { | |
| "entropy": 0.43168967962265015, | |
| "epoch": 37.25, | |
| "grad_norm": 3.8260834217071533, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0364, | |
| "mean_token_accuracy": 0.9823232293128967, | |
| "num_tokens": 195228042.0, | |
| "step": 745 | |
| }, | |
| { | |
| "entropy": 0.4312366247177124, | |
| "epoch": 37.3, | |
| "grad_norm": 9.227532386779785, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0567, | |
| "mean_token_accuracy": 0.9740932583808899, | |
| "num_tokens": 195490146.0, | |
| "step": 746 | |
| }, | |
| { | |
| "entropy": 0.43321493268013, | |
| "epoch": 37.35, | |
| "grad_norm": 6.180078983306885, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0491, | |
| "mean_token_accuracy": 0.9776632189750671, | |
| "num_tokens": 195752155.0, | |
| "step": 747 | |
| }, | |
| { | |
| "entropy": 0.43195170164108276, | |
| "epoch": 37.4, | |
| "grad_norm": 5.982614994049072, | |
| "learning_rate": 1e-06, | |
| "loss": 0.058, | |
| "mean_token_accuracy": 0.9748128056526184, | |
| "num_tokens": 196014217.0, | |
| "step": 748 | |
| }, | |
| { | |
| "entropy": 0.43084219098091125, | |
| "epoch": 37.45, | |
| "grad_norm": 3.5786056518554688, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0381, | |
| "mean_token_accuracy": 0.9858906269073486, | |
| "num_tokens": 196276280.0, | |
| "step": 749 | |
| }, | |
| { | |
| "entropy": 0.42843180894851685, | |
| "epoch": 37.5, | |
| "grad_norm": 7.1140360832214355, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0519, | |
| "mean_token_accuracy": 0.9748620390892029, | |
| "num_tokens": 196538306.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 0.4283401370048523, | |
| "epoch": 37.55, | |
| "grad_norm": 4.959768772125244, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0352, | |
| "mean_token_accuracy": 0.9865038394927979, | |
| "num_tokens": 196800356.0, | |
| "step": 751 | |
| }, | |
| { | |
| "entropy": 0.42931511998176575, | |
| "epoch": 37.6, | |
| "grad_norm": 4.487987995147705, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0487, | |
| "mean_token_accuracy": 0.9777777791023254, | |
| "num_tokens": 197062395.0, | |
| "step": 752 | |
| }, | |
| { | |
| "entropy": 0.43021565675735474, | |
| "epoch": 37.65, | |
| "grad_norm": 5.882363796234131, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0574, | |
| "mean_token_accuracy": 0.9759414196014404, | |
| "num_tokens": 197324405.0, | |
| "step": 753 | |
| }, | |
| { | |
| "entropy": 0.42972004413604736, | |
| "epoch": 37.7, | |
| "grad_norm": 5.719748020172119, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0518, | |
| "mean_token_accuracy": 0.9800443649291992, | |
| "num_tokens": 197586450.0, | |
| "step": 754 | |
| }, | |
| { | |
| "entropy": 0.42973631620407104, | |
| "epoch": 37.75, | |
| "grad_norm": 5.8527398109436035, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0561, | |
| "mean_token_accuracy": 0.9730787873268127, | |
| "num_tokens": 197848459.0, | |
| "step": 755 | |
| }, | |
| { | |
| "entropy": 0.428989052772522, | |
| "epoch": 37.8, | |
| "grad_norm": 6.304094314575195, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0529, | |
| "mean_token_accuracy": 0.9790301322937012, | |
| "num_tokens": 198110512.0, | |
| "step": 756 | |
| }, | |
| { | |
| "entropy": 0.4285169839859009, | |
| "epoch": 37.85, | |
| "grad_norm": 7.316928863525391, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0656, | |
| "mean_token_accuracy": 0.9671322703361511, | |
| "num_tokens": 198372586.0, | |
| "step": 757 | |
| }, | |
| { | |
| "entropy": 0.4262317419052124, | |
| "epoch": 37.9, | |
| "grad_norm": 5.824263095855713, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0475, | |
| "mean_token_accuracy": 0.9809321761131287, | |
| "num_tokens": 198634661.0, | |
| "step": 758 | |
| }, | |
| { | |
| "entropy": 0.43088752031326294, | |
| "epoch": 37.95, | |
| "grad_norm": 4.740274429321289, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0448, | |
| "mean_token_accuracy": 0.9836280345916748, | |
| "num_tokens": 198896715.0, | |
| "step": 759 | |
| }, | |
| { | |
| "entropy": 0.4321492612361908, | |
| "epoch": 38.0, | |
| "grad_norm": 7.113720417022705, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0461, | |
| "mean_token_accuracy": 0.9796651005744934, | |
| "num_tokens": 199158782.0, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 38.0, | |
| "eval_entropy": 0.4316790699958801, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9079301357269287, | |
| "eval_num_tokens": 199158782.0, | |
| "eval_runtime": 0.5603, | |
| "eval_samples_per_second": 446.212, | |
| "eval_steps_per_second": 1.785, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 0.4297526180744171, | |
| "epoch": 38.05, | |
| "grad_norm": 6.399072647094727, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0446, | |
| "mean_token_accuracy": 0.9808707237243652, | |
| "num_tokens": 199420825.0, | |
| "step": 761 | |
| }, | |
| { | |
| "entropy": 0.4314311742782593, | |
| "epoch": 38.1, | |
| "grad_norm": 5.302173614501953, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0601, | |
| "mean_token_accuracy": 0.9708454608917236, | |
| "num_tokens": 199682902.0, | |
| "step": 762 | |
| }, | |
| { | |
| "entropy": 0.4309634566307068, | |
| "epoch": 38.15, | |
| "grad_norm": 6.87785005569458, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0541, | |
| "mean_token_accuracy": 0.976190447807312, | |
| "num_tokens": 199944938.0, | |
| "step": 763 | |
| }, | |
| { | |
| "entropy": 0.43027186393737793, | |
| "epoch": 38.2, | |
| "grad_norm": 5.8634138107299805, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0414, | |
| "mean_token_accuracy": 0.9833240509033203, | |
| "num_tokens": 200207000.0, | |
| "step": 764 | |
| }, | |
| { | |
| "entropy": 0.4300784468650818, | |
| "epoch": 38.25, | |
| "grad_norm": 17.801645278930664, | |
| "learning_rate": 1e-06, | |
| "loss": 0.033, | |
| "mean_token_accuracy": 0.9908369183540344, | |
| "num_tokens": 200469065.0, | |
| "step": 765 | |
| }, | |
| { | |
| "entropy": 0.43063056468963623, | |
| "epoch": 38.3, | |
| "grad_norm": 4.167501926422119, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0519, | |
| "mean_token_accuracy": 0.97929847240448, | |
| "num_tokens": 200731067.0, | |
| "step": 766 | |
| }, | |
| { | |
| "entropy": 0.4308306872844696, | |
| "epoch": 38.35, | |
| "grad_norm": 4.516330718994141, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0485, | |
| "mean_token_accuracy": 0.9816091656684875, | |
| "num_tokens": 200993097.0, | |
| "step": 767 | |
| }, | |
| { | |
| "entropy": 0.4297667145729065, | |
| "epoch": 38.4, | |
| "grad_norm": 4.7929277420043945, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0488, | |
| "mean_token_accuracy": 0.9790105223655701, | |
| "num_tokens": 201255156.0, | |
| "step": 768 | |
| }, | |
| { | |
| "entropy": 0.4306723475456238, | |
| "epoch": 38.45, | |
| "grad_norm": 4.585225582122803, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0455, | |
| "mean_token_accuracy": 0.9819868803024292, | |
| "num_tokens": 201517185.0, | |
| "step": 769 | |
| }, | |
| { | |
| "entropy": 0.4299125671386719, | |
| "epoch": 38.5, | |
| "grad_norm": 4.201162815093994, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0431, | |
| "mean_token_accuracy": 0.9805615544319153, | |
| "num_tokens": 201779233.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 0.43096548318862915, | |
| "epoch": 38.55, | |
| "grad_norm": 5.361374855041504, | |
| "learning_rate": 1e-06, | |
| "loss": 0.05, | |
| "mean_token_accuracy": 0.9783693552017212, | |
| "num_tokens": 202041266.0, | |
| "step": 771 | |
| }, | |
| { | |
| "entropy": 0.4295889735221863, | |
| "epoch": 38.6, | |
| "grad_norm": 7.002900123596191, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0585, | |
| "mean_token_accuracy": 0.9755419492721558, | |
| "num_tokens": 202303295.0, | |
| "step": 772 | |
| }, | |
| { | |
| "entropy": 0.42901116609573364, | |
| "epoch": 38.65, | |
| "grad_norm": 10.230154037475586, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0523, | |
| "mean_token_accuracy": 0.9762485027313232, | |
| "num_tokens": 202565332.0, | |
| "step": 773 | |
| }, | |
| { | |
| "entropy": 0.42743608355522156, | |
| "epoch": 38.7, | |
| "grad_norm": 5.833381175994873, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0582, | |
| "mean_token_accuracy": 0.9707192778587341, | |
| "num_tokens": 202827364.0, | |
| "step": 774 | |
| }, | |
| { | |
| "entropy": 0.42607641220092773, | |
| "epoch": 38.75, | |
| "grad_norm": 6.477557182312012, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0551, | |
| "mean_token_accuracy": 0.974967896938324, | |
| "num_tokens": 203089449.0, | |
| "step": 775 | |
| }, | |
| { | |
| "entropy": 0.42739659547805786, | |
| "epoch": 38.8, | |
| "grad_norm": 7.6487627029418945, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0333, | |
| "mean_token_accuracy": 0.9863269329071045, | |
| "num_tokens": 203351519.0, | |
| "step": 776 | |
| }, | |
| { | |
| "entropy": 0.4268617630004883, | |
| "epoch": 38.85, | |
| "grad_norm": 4.835480213165283, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0378, | |
| "mean_token_accuracy": 0.9873916506767273, | |
| "num_tokens": 203613612.0, | |
| "step": 777 | |
| }, | |
| { | |
| "entropy": 0.4261825680732727, | |
| "epoch": 38.9, | |
| "grad_norm": 5.235621452331543, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0496, | |
| "mean_token_accuracy": 0.98097825050354, | |
| "num_tokens": 203875682.0, | |
| "step": 778 | |
| }, | |
| { | |
| "entropy": 0.4306997060775757, | |
| "epoch": 38.95, | |
| "grad_norm": 6.902498722076416, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0538, | |
| "mean_token_accuracy": 0.9750712513923645, | |
| "num_tokens": 204137745.0, | |
| "step": 779 | |
| }, | |
| { | |
| "entropy": 0.4279959797859192, | |
| "epoch": 39.0, | |
| "grad_norm": 5.471578598022461, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0479, | |
| "mean_token_accuracy": 0.9792307615280151, | |
| "num_tokens": 204399803.0, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 39.0, | |
| "eval_entropy": 0.4311733841896057, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9106183052062988, | |
| "eval_num_tokens": 204399803.0, | |
| "eval_runtime": 0.563, | |
| "eval_samples_per_second": 444.061, | |
| "eval_steps_per_second": 1.776, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 0.4303116500377655, | |
| "epoch": 39.05, | |
| "grad_norm": 5.4172492027282715, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0538, | |
| "mean_token_accuracy": 0.9733893275260925, | |
| "num_tokens": 204661857.0, | |
| "step": 781 | |
| }, | |
| { | |
| "entropy": 0.4293464422225952, | |
| "epoch": 39.1, | |
| "grad_norm": 12.146878242492676, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0566, | |
| "mean_token_accuracy": 0.9732072353363037, | |
| "num_tokens": 204923950.0, | |
| "step": 782 | |
| }, | |
| { | |
| "entropy": 0.4306670129299164, | |
| "epoch": 39.15, | |
| "grad_norm": 11.113872528076172, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0495, | |
| "mean_token_accuracy": 0.9793368577957153, | |
| "num_tokens": 205186030.0, | |
| "step": 783 | |
| }, | |
| { | |
| "entropy": 0.4303710460662842, | |
| "epoch": 39.2, | |
| "grad_norm": 4.949894905090332, | |
| "learning_rate": 1e-06, | |
| "loss": 0.046, | |
| "mean_token_accuracy": 0.9795244336128235, | |
| "num_tokens": 205448071.0, | |
| "step": 784 | |
| }, | |
| { | |
| "entropy": 0.4308937191963196, | |
| "epoch": 39.25, | |
| "grad_norm": 6.6487650871276855, | |
| "learning_rate": 1e-06, | |
| "loss": 0.048, | |
| "mean_token_accuracy": 0.9802955389022827, | |
| "num_tokens": 205710085.0, | |
| "step": 785 | |
| }, | |
| { | |
| "entropy": 0.42958366870880127, | |
| "epoch": 39.3, | |
| "grad_norm": 6.201844692230225, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0487, | |
| "mean_token_accuracy": 0.97826087474823, | |
| "num_tokens": 205972150.0, | |
| "step": 786 | |
| }, | |
| { | |
| "entropy": 0.43153223395347595, | |
| "epoch": 39.35, | |
| "grad_norm": 8.383527755737305, | |
| "learning_rate": 1e-06, | |
| "loss": 0.043, | |
| "mean_token_accuracy": 0.9783783555030823, | |
| "num_tokens": 206234177.0, | |
| "step": 787 | |
| }, | |
| { | |
| "entropy": 0.4291541278362274, | |
| "epoch": 39.4, | |
| "grad_norm": 4.192736625671387, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0371, | |
| "mean_token_accuracy": 0.9854142069816589, | |
| "num_tokens": 206496187.0, | |
| "step": 788 | |
| }, | |
| { | |
| "entropy": 0.4308379590511322, | |
| "epoch": 39.45, | |
| "grad_norm": 7.078681468963623, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0397, | |
| "mean_token_accuracy": 0.9830328822135925, | |
| "num_tokens": 206758204.0, | |
| "step": 789 | |
| }, | |
| { | |
| "entropy": 0.42960792779922485, | |
| "epoch": 39.5, | |
| "grad_norm": 3.608894109725952, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0301, | |
| "mean_token_accuracy": 0.9880810379981995, | |
| "num_tokens": 207020244.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 0.4298979938030243, | |
| "epoch": 39.55, | |
| "grad_norm": 4.762541770935059, | |
| "learning_rate": 1e-06, | |
| "loss": 0.051, | |
| "mean_token_accuracy": 0.9795918464660645, | |
| "num_tokens": 207282305.0, | |
| "step": 791 | |
| }, | |
| { | |
| "entropy": 0.43151146173477173, | |
| "epoch": 39.6, | |
| "grad_norm": 3.833782434463501, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0453, | |
| "mean_token_accuracy": 0.982624351978302, | |
| "num_tokens": 207544369.0, | |
| "step": 792 | |
| }, | |
| { | |
| "entropy": 0.4292357563972473, | |
| "epoch": 39.65, | |
| "grad_norm": 4.718764781951904, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0395, | |
| "mean_token_accuracy": 0.9843843579292297, | |
| "num_tokens": 207806396.0, | |
| "step": 793 | |
| }, | |
| { | |
| "entropy": 0.4275195598602295, | |
| "epoch": 39.7, | |
| "grad_norm": 4.482115268707275, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0482, | |
| "mean_token_accuracy": 0.980169951915741, | |
| "num_tokens": 208068467.0, | |
| "step": 794 | |
| }, | |
| { | |
| "entropy": 0.4273834824562073, | |
| "epoch": 39.75, | |
| "grad_norm": 8.38062572479248, | |
| "learning_rate": 1e-06, | |
| "loss": 0.055, | |
| "mean_token_accuracy": 0.9755538702011108, | |
| "num_tokens": 208330534.0, | |
| "step": 795 | |
| }, | |
| { | |
| "entropy": 0.4268650710582733, | |
| "epoch": 39.8, | |
| "grad_norm": 7.30850887298584, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0434, | |
| "mean_token_accuracy": 0.9825853705406189, | |
| "num_tokens": 208592587.0, | |
| "step": 796 | |
| }, | |
| { | |
| "entropy": 0.42521435022354126, | |
| "epoch": 39.85, | |
| "grad_norm": 5.413529396057129, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0459, | |
| "mean_token_accuracy": 0.9793140888214111, | |
| "num_tokens": 208854654.0, | |
| "step": 797 | |
| }, | |
| { | |
| "entropy": 0.4262104332447052, | |
| "epoch": 39.9, | |
| "grad_norm": 8.122570991516113, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0598, | |
| "mean_token_accuracy": 0.9763879776000977, | |
| "num_tokens": 209116715.0, | |
| "step": 798 | |
| }, | |
| { | |
| "entropy": 0.4243553578853607, | |
| "epoch": 39.95, | |
| "grad_norm": 11.898737907409668, | |
| "learning_rate": 1e-06, | |
| "loss": 0.051, | |
| "mean_token_accuracy": 0.9789808988571167, | |
| "num_tokens": 209378779.0, | |
| "step": 799 | |
| }, | |
| { | |
| "entropy": 0.42552608251571655, | |
| "epoch": 40.0, | |
| "grad_norm": 4.8858723640441895, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0554, | |
| "mean_token_accuracy": 0.974328339099884, | |
| "num_tokens": 209640816.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "eval_entropy": 0.4265006184577942, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.8985214829444885, | |
| "eval_num_tokens": 209640816.0, | |
| "eval_runtime": 0.5639, | |
| "eval_samples_per_second": 443.309, | |
| "eval_steps_per_second": 1.773, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 0.42547088861465454, | |
| "epoch": 40.05, | |
| "grad_norm": 3.928428888320923, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0336, | |
| "mean_token_accuracy": 0.9873657822608948, | |
| "num_tokens": 209902893.0, | |
| "step": 801 | |
| }, | |
| { | |
| "entropy": 0.4265702962875366, | |
| "epoch": 40.1, | |
| "grad_norm": 6.94235372543335, | |
| "learning_rate": 1e-06, | |
| "loss": 0.049, | |
| "mean_token_accuracy": 0.9771819710731506, | |
| "num_tokens": 210164942.0, | |
| "step": 802 | |
| }, | |
| { | |
| "entropy": 0.42739298939704895, | |
| "epoch": 40.15, | |
| "grad_norm": 10.066420555114746, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0489, | |
| "mean_token_accuracy": 0.9780876636505127, | |
| "num_tokens": 210426975.0, | |
| "step": 803 | |
| }, | |
| { | |
| "entropy": 0.4282771050930023, | |
| "epoch": 40.2, | |
| "grad_norm": 4.0888848304748535, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0416, | |
| "mean_token_accuracy": 0.9854447245597839, | |
| "num_tokens": 210689060.0, | |
| "step": 804 | |
| }, | |
| { | |
| "entropy": 0.4287988841533661, | |
| "epoch": 40.25, | |
| "grad_norm": 5.135344505310059, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0305, | |
| "mean_token_accuracy": 0.9879999756813049, | |
| "num_tokens": 210951087.0, | |
| "step": 805 | |
| }, | |
| { | |
| "entropy": 0.4283785820007324, | |
| "epoch": 40.3, | |
| "grad_norm": 7.806493759155273, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0605, | |
| "mean_token_accuracy": 0.9761051535606384, | |
| "num_tokens": 211213123.0, | |
| "step": 806 | |
| }, | |
| { | |
| "entropy": 0.42876169085502625, | |
| "epoch": 40.35, | |
| "grad_norm": 4.562885761260986, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0432, | |
| "mean_token_accuracy": 0.9820144176483154, | |
| "num_tokens": 211475167.0, | |
| "step": 807 | |
| }, | |
| { | |
| "entropy": 0.42846542596817017, | |
| "epoch": 40.4, | |
| "grad_norm": 5.972072124481201, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0439, | |
| "mean_token_accuracy": 0.9850237965583801, | |
| "num_tokens": 211737196.0, | |
| "step": 808 | |
| }, | |
| { | |
| "entropy": 0.42836448550224304, | |
| "epoch": 40.45, | |
| "grad_norm": 4.6945319175720215, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0395, | |
| "mean_token_accuracy": 0.9843930602073669, | |
| "num_tokens": 211999255.0, | |
| "step": 809 | |
| }, | |
| { | |
| "entropy": 0.4259374439716339, | |
| "epoch": 40.5, | |
| "grad_norm": 7.984348773956299, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0521, | |
| "mean_token_accuracy": 0.9777777791023254, | |
| "num_tokens": 212261345.0, | |
| "step": 810 | |
| }, | |
| { | |
| "entropy": 0.4267132580280304, | |
| "epoch": 40.55, | |
| "grad_norm": 5.722595691680908, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0537, | |
| "mean_token_accuracy": 0.9772727489471436, | |
| "num_tokens": 212523401.0, | |
| "step": 811 | |
| }, | |
| { | |
| "entropy": 0.42591041326522827, | |
| "epoch": 40.6, | |
| "grad_norm": 8.059563636779785, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0512, | |
| "mean_token_accuracy": 0.9775811433792114, | |
| "num_tokens": 212785491.0, | |
| "step": 812 | |
| }, | |
| { | |
| "entropy": 0.4272247552871704, | |
| "epoch": 40.65, | |
| "grad_norm": 4.5345869064331055, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0423, | |
| "mean_token_accuracy": 0.9814593195915222, | |
| "num_tokens": 213047558.0, | |
| "step": 813 | |
| }, | |
| { | |
| "entropy": 0.4253997802734375, | |
| "epoch": 40.7, | |
| "grad_norm": 10.771305084228516, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0596, | |
| "mean_token_accuracy": 0.972937285900116, | |
| "num_tokens": 213309600.0, | |
| "step": 814 | |
| }, | |
| { | |
| "entropy": 0.4264408349990845, | |
| "epoch": 40.75, | |
| "grad_norm": 8.536927223205566, | |
| "learning_rate": 1e-06, | |
| "loss": 0.058, | |
| "mean_token_accuracy": 0.9755164384841919, | |
| "num_tokens": 213571632.0, | |
| "step": 815 | |
| }, | |
| { | |
| "entropy": 0.42377805709838867, | |
| "epoch": 40.8, | |
| "grad_norm": 6.131271839141846, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0382, | |
| "mean_token_accuracy": 0.9832285046577454, | |
| "num_tokens": 213833689.0, | |
| "step": 816 | |
| }, | |
| { | |
| "entropy": 0.4256356656551361, | |
| "epoch": 40.85, | |
| "grad_norm": 5.8921613693237305, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0491, | |
| "mean_token_accuracy": 0.9806221127510071, | |
| "num_tokens": 214095715.0, | |
| "step": 817 | |
| }, | |
| { | |
| "entropy": 0.42740219831466675, | |
| "epoch": 40.9, | |
| "grad_norm": 7.05807638168335, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0419, | |
| "mean_token_accuracy": 0.9825970530509949, | |
| "num_tokens": 214357769.0, | |
| "step": 818 | |
| }, | |
| { | |
| "entropy": 0.42619818449020386, | |
| "epoch": 40.95, | |
| "grad_norm": 8.339810371398926, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0539, | |
| "mean_token_accuracy": 0.9760000109672546, | |
| "num_tokens": 214619789.0, | |
| "step": 819 | |
| }, | |
| { | |
| "entropy": 0.4287991523742676, | |
| "epoch": 41.0, | |
| "grad_norm": 6.295149326324463, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0439, | |
| "mean_token_accuracy": 0.981502890586853, | |
| "num_tokens": 214881848.0, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 41.0, | |
| "eval_entropy": 0.42802900075912476, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9032257795333862, | |
| "eval_num_tokens": 214881848.0, | |
| "eval_runtime": 0.5634, | |
| "eval_samples_per_second": 443.721, | |
| "eval_steps_per_second": 1.775, | |
| "step": 820 | |
| }, | |
| { | |
| "entropy": 0.42603129148483276, | |
| "epoch": 41.05, | |
| "grad_norm": 4.932643890380859, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0385, | |
| "mean_token_accuracy": 0.9828060269355774, | |
| "num_tokens": 215143928.0, | |
| "step": 821 | |
| }, | |
| { | |
| "entropy": 0.4281240701675415, | |
| "epoch": 41.1, | |
| "grad_norm": 8.819388389587402, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0526, | |
| "mean_token_accuracy": 0.977624773979187, | |
| "num_tokens": 215405967.0, | |
| "step": 822 | |
| }, | |
| { | |
| "entropy": 0.4279868006706238, | |
| "epoch": 41.15, | |
| "grad_norm": 5.604109764099121, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0405, | |
| "mean_token_accuracy": 0.9837586879730225, | |
| "num_tokens": 215668020.0, | |
| "step": 823 | |
| }, | |
| { | |
| "entropy": 0.4266362190246582, | |
| "epoch": 41.2, | |
| "grad_norm": 5.590734481811523, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0478, | |
| "mean_token_accuracy": 0.978723406791687, | |
| "num_tokens": 215930074.0, | |
| "step": 824 | |
| }, | |
| { | |
| "entropy": 0.42430201172828674, | |
| "epoch": 41.25, | |
| "grad_norm": 6.548094749450684, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0404, | |
| "mean_token_accuracy": 0.9815140962600708, | |
| "num_tokens": 216192133.0, | |
| "step": 825 | |
| }, | |
| { | |
| "entropy": 0.4260609447956085, | |
| "epoch": 41.3, | |
| "grad_norm": 4.57349967956543, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0441, | |
| "mean_token_accuracy": 0.981225848197937, | |
| "num_tokens": 216454174.0, | |
| "step": 826 | |
| }, | |
| { | |
| "entropy": 0.42658817768096924, | |
| "epoch": 41.35, | |
| "grad_norm": 6.89821195602417, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0463, | |
| "mean_token_accuracy": 0.9805615544319153, | |
| "num_tokens": 216716222.0, | |
| "step": 827 | |
| }, | |
| { | |
| "entropy": 0.4260250926017761, | |
| "epoch": 41.4, | |
| "grad_norm": 5.968894958496094, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0421, | |
| "mean_token_accuracy": 0.9821937084197998, | |
| "num_tokens": 216978285.0, | |
| "step": 828 | |
| }, | |
| { | |
| "entropy": 0.4265430271625519, | |
| "epoch": 41.45, | |
| "grad_norm": 4.956072807312012, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0581, | |
| "mean_token_accuracy": 0.9706303477287292, | |
| "num_tokens": 217240373.0, | |
| "step": 829 | |
| }, | |
| { | |
| "entropy": 0.4250110387802124, | |
| "epoch": 41.5, | |
| "grad_norm": 5.861893177032471, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0393, | |
| "mean_token_accuracy": 0.9804489612579346, | |
| "num_tokens": 217502413.0, | |
| "step": 830 | |
| }, | |
| { | |
| "entropy": 0.42582571506500244, | |
| "epoch": 41.55, | |
| "grad_norm": 6.167220592498779, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0489, | |
| "mean_token_accuracy": 0.9791246056556702, | |
| "num_tokens": 217764491.0, | |
| "step": 831 | |
| }, | |
| { | |
| "entropy": 0.42461884021759033, | |
| "epoch": 41.6, | |
| "grad_norm": 6.987247943878174, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0438, | |
| "mean_token_accuracy": 0.9817232489585876, | |
| "num_tokens": 218026550.0, | |
| "step": 832 | |
| }, | |
| { | |
| "entropy": 0.4255552291870117, | |
| "epoch": 41.65, | |
| "grad_norm": 3.994992256164551, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0361, | |
| "mean_token_accuracy": 0.9873896837234497, | |
| "num_tokens": 218288597.0, | |
| "step": 833 | |
| }, | |
| { | |
| "entropy": 0.42477789521217346, | |
| "epoch": 41.7, | |
| "grad_norm": 4.242578506469727, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0366, | |
| "mean_token_accuracy": 0.9860182404518127, | |
| "num_tokens": 218550637.0, | |
| "step": 834 | |
| }, | |
| { | |
| "entropy": 0.42430806159973145, | |
| "epoch": 41.75, | |
| "grad_norm": 4.776451110839844, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0477, | |
| "mean_token_accuracy": 0.9796454310417175, | |
| "num_tokens": 218812720.0, | |
| "step": 835 | |
| }, | |
| { | |
| "entropy": 0.42646872997283936, | |
| "epoch": 41.8, | |
| "grad_norm": 4.909146785736084, | |
| "learning_rate": 1e-06, | |
| "loss": 0.052, | |
| "mean_token_accuracy": 0.974078357219696, | |
| "num_tokens": 219074719.0, | |
| "step": 836 | |
| }, | |
| { | |
| "entropy": 0.4244542121887207, | |
| "epoch": 41.85, | |
| "grad_norm": 5.178461074829102, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0413, | |
| "mean_token_accuracy": 0.9836763739585876, | |
| "num_tokens": 219336787.0, | |
| "step": 837 | |
| }, | |
| { | |
| "entropy": 0.4257306456565857, | |
| "epoch": 41.9, | |
| "grad_norm": 2.8526721000671387, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0405, | |
| "mean_token_accuracy": 0.984000027179718, | |
| "num_tokens": 219598826.0, | |
| "step": 838 | |
| }, | |
| { | |
| "entropy": 0.4252172112464905, | |
| "epoch": 41.95, | |
| "grad_norm": 3.7361361980438232, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0477, | |
| "mean_token_accuracy": 0.98320072889328, | |
| "num_tokens": 219860823.0, | |
| "step": 839 | |
| }, | |
| { | |
| "entropy": 0.4218195080757141, | |
| "epoch": 42.0, | |
| "grad_norm": 8.049124717712402, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0335, | |
| "mean_token_accuracy": 0.9856651425361633, | |
| "num_tokens": 220122896.0, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 42.0, | |
| "eval_entropy": 0.4240899682044983, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9005376100540161, | |
| "eval_num_tokens": 220122896.0, | |
| "eval_runtime": 0.5603, | |
| "eval_samples_per_second": 446.2, | |
| "eval_steps_per_second": 1.785, | |
| "step": 840 | |
| }, | |
| { | |
| "entropy": 0.42247503995895386, | |
| "epoch": 42.05, | |
| "grad_norm": 4.249022483825684, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0493, | |
| "mean_token_accuracy": 0.9803094267845154, | |
| "num_tokens": 220384944.0, | |
| "step": 841 | |
| }, | |
| { | |
| "entropy": 0.42273712158203125, | |
| "epoch": 42.1, | |
| "grad_norm": 3.7368266582489014, | |
| "learning_rate": 1e-06, | |
| "loss": 0.05, | |
| "mean_token_accuracy": 0.9767295718193054, | |
| "num_tokens": 220646995.0, | |
| "step": 842 | |
| }, | |
| { | |
| "entropy": 0.4206734001636505, | |
| "epoch": 42.15, | |
| "grad_norm": 2.553225040435791, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0397, | |
| "mean_token_accuracy": 0.9799764156341553, | |
| "num_tokens": 220909055.0, | |
| "step": 843 | |
| }, | |
| { | |
| "entropy": 0.42262324690818787, | |
| "epoch": 42.2, | |
| "grad_norm": 4.007044792175293, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0338, | |
| "mean_token_accuracy": 0.9858860373497009, | |
| "num_tokens": 221171099.0, | |
| "step": 844 | |
| }, | |
| { | |
| "entropy": 0.4206918478012085, | |
| "epoch": 42.25, | |
| "grad_norm": 5.613060474395752, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0499, | |
| "mean_token_accuracy": 0.9805825352668762, | |
| "num_tokens": 221433171.0, | |
| "step": 845 | |
| }, | |
| { | |
| "entropy": 0.4224393963813782, | |
| "epoch": 42.3, | |
| "grad_norm": 7.5523200035095215, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0302, | |
| "mean_token_accuracy": 0.9912751913070679, | |
| "num_tokens": 221695221.0, | |
| "step": 846 | |
| }, | |
| { | |
| "entropy": 0.4227662980556488, | |
| "epoch": 42.35, | |
| "grad_norm": 3.233724594116211, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0431, | |
| "mean_token_accuracy": 0.9814459085464478, | |
| "num_tokens": 221957278.0, | |
| "step": 847 | |
| }, | |
| { | |
| "entropy": 0.4223962128162384, | |
| "epoch": 42.4, | |
| "grad_norm": 5.855934143066406, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0398, | |
| "mean_token_accuracy": 0.9815050959587097, | |
| "num_tokens": 222219340.0, | |
| "step": 848 | |
| }, | |
| { | |
| "entropy": 0.42316097021102905, | |
| "epoch": 42.45, | |
| "grad_norm": 4.901780605316162, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0429, | |
| "mean_token_accuracy": 0.9836879372596741, | |
| "num_tokens": 222481409.0, | |
| "step": 849 | |
| }, | |
| { | |
| "entropy": 0.42461538314819336, | |
| "epoch": 42.5, | |
| "grad_norm": 5.965307712554932, | |
| "learning_rate": 1e-06, | |
| "loss": 0.044, | |
| "mean_token_accuracy": 0.9818791747093201, | |
| "num_tokens": 222743459.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 0.42335671186447144, | |
| "epoch": 42.55, | |
| "grad_norm": 4.5567426681518555, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0458, | |
| "mean_token_accuracy": 0.981582522392273, | |
| "num_tokens": 223005518.0, | |
| "step": 851 | |
| }, | |
| { | |
| "entropy": 0.4207357168197632, | |
| "epoch": 42.6, | |
| "grad_norm": 6.25122594833374, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0336, | |
| "mean_token_accuracy": 0.9905003309249878, | |
| "num_tokens": 223267591.0, | |
| "step": 852 | |
| }, | |
| { | |
| "entropy": 0.42162737250328064, | |
| "epoch": 42.65, | |
| "grad_norm": 5.886632442474365, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0428, | |
| "mean_token_accuracy": 0.9807093739509583, | |
| "num_tokens": 223529626.0, | |
| "step": 853 | |
| }, | |
| { | |
| "entropy": 0.4217204749584198, | |
| "epoch": 42.7, | |
| "grad_norm": 4.245472431182861, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0367, | |
| "mean_token_accuracy": 0.9864498376846313, | |
| "num_tokens": 223791695.0, | |
| "step": 854 | |
| }, | |
| { | |
| "entropy": 0.42133527994155884, | |
| "epoch": 42.75, | |
| "grad_norm": 3.2346744537353516, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0394, | |
| "mean_token_accuracy": 0.9854153394699097, | |
| "num_tokens": 224053733.0, | |
| "step": 855 | |
| }, | |
| { | |
| "entropy": 0.4236084222793579, | |
| "epoch": 42.8, | |
| "grad_norm": 3.6470589637756348, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0518, | |
| "mean_token_accuracy": 0.9786067008972168, | |
| "num_tokens": 224315753.0, | |
| "step": 856 | |
| }, | |
| { | |
| "entropy": 0.42061948776245117, | |
| "epoch": 42.85, | |
| "grad_norm": 5.6351423263549805, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0392, | |
| "mean_token_accuracy": 0.9876466989517212, | |
| "num_tokens": 224577800.0, | |
| "step": 857 | |
| }, | |
| { | |
| "entropy": 0.4228881597518921, | |
| "epoch": 42.9, | |
| "grad_norm": 2.740384101867676, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0423, | |
| "mean_token_accuracy": 0.9824660420417786, | |
| "num_tokens": 224839831.0, | |
| "step": 858 | |
| }, | |
| { | |
| "entropy": 0.4235576391220093, | |
| "epoch": 42.95, | |
| "grad_norm": 3.5690433979034424, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0525, | |
| "mean_token_accuracy": 0.9748982191085815, | |
| "num_tokens": 225101898.0, | |
| "step": 859 | |
| }, | |
| { | |
| "entropy": 0.4226834177970886, | |
| "epoch": 43.0, | |
| "grad_norm": 5.949954986572266, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0388, | |
| "mean_token_accuracy": 0.9843918085098267, | |
| "num_tokens": 225363920.0, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 43.0, | |
| "eval_entropy": 0.42189276218414307, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.899193525314331, | |
| "eval_num_tokens": 225363920.0, | |
| "eval_runtime": 0.5598, | |
| "eval_samples_per_second": 446.553, | |
| "eval_steps_per_second": 1.786, | |
| "step": 860 | |
| }, | |
| { | |
| "entropy": 0.4208260178565979, | |
| "epoch": 43.05, | |
| "grad_norm": 3.177051544189453, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0438, | |
| "mean_token_accuracy": 0.980252742767334, | |
| "num_tokens": 225625977.0, | |
| "step": 861 | |
| }, | |
| { | |
| "entropy": 0.42219871282577515, | |
| "epoch": 43.1, | |
| "grad_norm": 4.462610721588135, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0451, | |
| "mean_token_accuracy": 0.9779735803604126, | |
| "num_tokens": 225888027.0, | |
| "step": 862 | |
| }, | |
| { | |
| "entropy": 0.420362651348114, | |
| "epoch": 43.15, | |
| "grad_norm": 2.9560749530792236, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0343, | |
| "mean_token_accuracy": 0.9873754382133484, | |
| "num_tokens": 226150059.0, | |
| "step": 863 | |
| }, | |
| { | |
| "entropy": 0.4182065427303314, | |
| "epoch": 43.2, | |
| "grad_norm": 6.363971710205078, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0484, | |
| "mean_token_accuracy": 0.982958972454071, | |
| "num_tokens": 226412141.0, | |
| "step": 864 | |
| }, | |
| { | |
| "entropy": 0.42202645540237427, | |
| "epoch": 43.25, | |
| "grad_norm": 4.371028900146484, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0365, | |
| "mean_token_accuracy": 0.985382080078125, | |
| "num_tokens": 226674206.0, | |
| "step": 865 | |
| }, | |
| { | |
| "entropy": 0.419817179441452, | |
| "epoch": 43.3, | |
| "grad_norm": 5.31802225112915, | |
| "learning_rate": 1e-06, | |
| "loss": 0.051, | |
| "mean_token_accuracy": 0.9777777791023254, | |
| "num_tokens": 226936212.0, | |
| "step": 866 | |
| }, | |
| { | |
| "entropy": 0.42121073603630066, | |
| "epoch": 43.35, | |
| "grad_norm": 5.029830455780029, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0318, | |
| "mean_token_accuracy": 0.9885404109954834, | |
| "num_tokens": 227198232.0, | |
| "step": 867 | |
| }, | |
| { | |
| "entropy": 0.4217337667942047, | |
| "epoch": 43.4, | |
| "grad_norm": 2.7915053367614746, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0345, | |
| "mean_token_accuracy": 0.9873780608177185, | |
| "num_tokens": 227460304.0, | |
| "step": 868 | |
| }, | |
| { | |
| "entropy": 0.42147403955459595, | |
| "epoch": 43.45, | |
| "grad_norm": 4.434348106384277, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0575, | |
| "mean_token_accuracy": 0.9757281541824341, | |
| "num_tokens": 227722380.0, | |
| "step": 869 | |
| }, | |
| { | |
| "entropy": 0.42027783393859863, | |
| "epoch": 43.5, | |
| "grad_norm": 6.0223283767700195, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0434, | |
| "mean_token_accuracy": 0.981566846370697, | |
| "num_tokens": 227984393.0, | |
| "step": 870 | |
| }, | |
| { | |
| "entropy": 0.4178963899612427, | |
| "epoch": 43.55, | |
| "grad_norm": 4.975872993469238, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0364, | |
| "mean_token_accuracy": 0.9844054579734802, | |
| "num_tokens": 228246459.0, | |
| "step": 871 | |
| }, | |
| { | |
| "entropy": 0.41726964712142944, | |
| "epoch": 43.6, | |
| "grad_norm": 4.340257167816162, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0346, | |
| "mean_token_accuracy": 0.9853479862213135, | |
| "num_tokens": 228508516.0, | |
| "step": 872 | |
| }, | |
| { | |
| "entropy": 0.41982972621917725, | |
| "epoch": 43.65, | |
| "grad_norm": 6.434086322784424, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0433, | |
| "mean_token_accuracy": 0.9811320900917053, | |
| "num_tokens": 228770542.0, | |
| "step": 873 | |
| }, | |
| { | |
| "entropy": 0.4189579486846924, | |
| "epoch": 43.7, | |
| "grad_norm": 5.864006519317627, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0444, | |
| "mean_token_accuracy": 0.9811431765556335, | |
| "num_tokens": 229032568.0, | |
| "step": 874 | |
| }, | |
| { | |
| "entropy": 0.4170803129673004, | |
| "epoch": 43.75, | |
| "grad_norm": 6.057931423187256, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0392, | |
| "mean_token_accuracy": 0.9832439422607422, | |
| "num_tokens": 229294620.0, | |
| "step": 875 | |
| }, | |
| { | |
| "entropy": 0.41728881001472473, | |
| "epoch": 43.8, | |
| "grad_norm": 2.91896653175354, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0402, | |
| "mean_token_accuracy": 0.9860140085220337, | |
| "num_tokens": 229556687.0, | |
| "step": 876 | |
| }, | |
| { | |
| "entropy": 0.4162396192550659, | |
| "epoch": 43.85, | |
| "grad_norm": 8.440881729125977, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0462, | |
| "mean_token_accuracy": 0.9821428656578064, | |
| "num_tokens": 229818749.0, | |
| "step": 877 | |
| }, | |
| { | |
| "entropy": 0.4187852144241333, | |
| "epoch": 43.9, | |
| "grad_norm": 5.375515937805176, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0492, | |
| "mean_token_accuracy": 0.9800754189491272, | |
| "num_tokens": 230080803.0, | |
| "step": 878 | |
| }, | |
| { | |
| "entropy": 0.41738641262054443, | |
| "epoch": 43.95, | |
| "grad_norm": 7.682968616485596, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0373, | |
| "mean_token_accuracy": 0.9865732789039612, | |
| "num_tokens": 230342878.0, | |
| "step": 879 | |
| }, | |
| { | |
| "entropy": 0.4192846417427063, | |
| "epoch": 44.0, | |
| "grad_norm": 5.086990833282471, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0388, | |
| "mean_token_accuracy": 0.9832605719566345, | |
| "num_tokens": 230604944.0, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 44.0, | |
| "eval_entropy": 0.42026257514953613, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9079301357269287, | |
| "eval_num_tokens": 230604944.0, | |
| "eval_runtime": 0.5735, | |
| "eval_samples_per_second": 435.937, | |
| "eval_steps_per_second": 1.744, | |
| "step": 880 | |
| }, | |
| { | |
| "entropy": 0.4180365204811096, | |
| "epoch": 44.05, | |
| "grad_norm": 3.057579755783081, | |
| "learning_rate": 1e-06, | |
| "loss": 0.032, | |
| "mean_token_accuracy": 0.9887217879295349, | |
| "num_tokens": 230867034.0, | |
| "step": 881 | |
| }, | |
| { | |
| "entropy": 0.4178960621356964, | |
| "epoch": 44.1, | |
| "grad_norm": 5.593236446380615, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0315, | |
| "mean_token_accuracy": 0.9854651093482971, | |
| "num_tokens": 231129083.0, | |
| "step": 882 | |
| }, | |
| { | |
| "entropy": 0.4179683327674866, | |
| "epoch": 44.15, | |
| "grad_norm": 5.593606948852539, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0466, | |
| "mean_token_accuracy": 0.9800342321395874, | |
| "num_tokens": 231391132.0, | |
| "step": 883 | |
| }, | |
| { | |
| "entropy": 0.4196028411388397, | |
| "epoch": 44.2, | |
| "grad_norm": 3.4420926570892334, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0384, | |
| "mean_token_accuracy": 0.9837988615036011, | |
| "num_tokens": 231653185.0, | |
| "step": 884 | |
| }, | |
| { | |
| "entropy": 0.4175465703010559, | |
| "epoch": 44.25, | |
| "grad_norm": 6.346271991729736, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0478, | |
| "mean_token_accuracy": 0.9762389659881592, | |
| "num_tokens": 231915251.0, | |
| "step": 885 | |
| }, | |
| { | |
| "entropy": 0.416942834854126, | |
| "epoch": 44.3, | |
| "grad_norm": 9.280478477478027, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0313, | |
| "mean_token_accuracy": 0.9847715497016907, | |
| "num_tokens": 232177321.0, | |
| "step": 886 | |
| }, | |
| { | |
| "entropy": 0.42018964886665344, | |
| "epoch": 44.35, | |
| "grad_norm": 6.544849395751953, | |
| "learning_rate": 1e-06, | |
| "loss": 0.043, | |
| "mean_token_accuracy": 0.9793233275413513, | |
| "num_tokens": 232439378.0, | |
| "step": 887 | |
| }, | |
| { | |
| "entropy": 0.41923967003822327, | |
| "epoch": 44.4, | |
| "grad_norm": 2.252636432647705, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0417, | |
| "mean_token_accuracy": 0.9808584451675415, | |
| "num_tokens": 232701431.0, | |
| "step": 888 | |
| }, | |
| { | |
| "entropy": 0.4177750051021576, | |
| "epoch": 44.45, | |
| "grad_norm": 2.6777243614196777, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0395, | |
| "mean_token_accuracy": 0.9847931861877441, | |
| "num_tokens": 232963470.0, | |
| "step": 889 | |
| }, | |
| { | |
| "entropy": 0.41958120465278625, | |
| "epoch": 44.5, | |
| "grad_norm": 3.8446385860443115, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0369, | |
| "mean_token_accuracy": 0.9842866063117981, | |
| "num_tokens": 233225456.0, | |
| "step": 890 | |
| }, | |
| { | |
| "entropy": 0.4201487600803375, | |
| "epoch": 44.55, | |
| "grad_norm": 2.9724559783935547, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0331, | |
| "mean_token_accuracy": 0.9868995547294617, | |
| "num_tokens": 233487520.0, | |
| "step": 891 | |
| }, | |
| { | |
| "entropy": 0.41942694783210754, | |
| "epoch": 44.6, | |
| "grad_norm": 4.230043888092041, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0513, | |
| "mean_token_accuracy": 0.9784172773361206, | |
| "num_tokens": 233749531.0, | |
| "step": 892 | |
| }, | |
| { | |
| "entropy": 0.41816410422325134, | |
| "epoch": 44.65, | |
| "grad_norm": 7.377174377441406, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0397, | |
| "mean_token_accuracy": 0.9815521836280823, | |
| "num_tokens": 234011564.0, | |
| "step": 893 | |
| }, | |
| { | |
| "entropy": 0.41752010583877563, | |
| "epoch": 44.7, | |
| "grad_norm": 10.760427474975586, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0304, | |
| "mean_token_accuracy": 0.9876638650894165, | |
| "num_tokens": 234273619.0, | |
| "step": 894 | |
| }, | |
| { | |
| "entropy": 0.41638725996017456, | |
| "epoch": 44.75, | |
| "grad_norm": 12.622779846191406, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0445, | |
| "mean_token_accuracy": 0.981951892375946, | |
| "num_tokens": 234535642.0, | |
| "step": 895 | |
| }, | |
| { | |
| "entropy": 0.41678711771965027, | |
| "epoch": 44.8, | |
| "grad_norm": 5.13592004776001, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0378, | |
| "mean_token_accuracy": 0.9822485446929932, | |
| "num_tokens": 234797723.0, | |
| "step": 896 | |
| }, | |
| { | |
| "entropy": 0.417410671710968, | |
| "epoch": 44.85, | |
| "grad_norm": 4.6480607986450195, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0461, | |
| "mean_token_accuracy": 0.9833546876907349, | |
| "num_tokens": 235059779.0, | |
| "step": 897 | |
| }, | |
| { | |
| "entropy": 0.41528117656707764, | |
| "epoch": 44.9, | |
| "grad_norm": 5.57151985168457, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0402, | |
| "mean_token_accuracy": 0.9841897487640381, | |
| "num_tokens": 235321857.0, | |
| "step": 898 | |
| }, | |
| { | |
| "entropy": 0.4173772633075714, | |
| "epoch": 44.95, | |
| "grad_norm": 9.17105770111084, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0414, | |
| "mean_token_accuracy": 0.9790310859680176, | |
| "num_tokens": 235583899.0, | |
| "step": 899 | |
| }, | |
| { | |
| "entropy": 0.4168033003807068, | |
| "epoch": 45.0, | |
| "grad_norm": 3.8450264930725098, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0437, | |
| "mean_token_accuracy": 0.9839246273040771, | |
| "num_tokens": 235845966.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 45.0, | |
| "eval_entropy": 0.4168623983860016, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.8965053558349609, | |
| "eval_num_tokens": 235845966.0, | |
| "eval_runtime": 0.5778, | |
| "eval_samples_per_second": 432.681, | |
| "eval_steps_per_second": 1.731, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 0.41515421867370605, | |
| "epoch": 45.05, | |
| "grad_norm": 7.109200477600098, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0523, | |
| "mean_token_accuracy": 0.9788933396339417, | |
| "num_tokens": 236108015.0, | |
| "step": 901 | |
| }, | |
| { | |
| "entropy": 0.41663509607315063, | |
| "epoch": 45.1, | |
| "grad_norm": 10.517169952392578, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0341, | |
| "mean_token_accuracy": 0.9875898361206055, | |
| "num_tokens": 236370040.0, | |
| "step": 902 | |
| }, | |
| { | |
| "entropy": 0.42009973526000977, | |
| "epoch": 45.15, | |
| "grad_norm": 7.42244815826416, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0381, | |
| "mean_token_accuracy": 0.9828254580497742, | |
| "num_tokens": 236632075.0, | |
| "step": 903 | |
| }, | |
| { | |
| "entropy": 0.418070524930954, | |
| "epoch": 45.2, | |
| "grad_norm": 6.486195087432861, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0394, | |
| "mean_token_accuracy": 0.9808374643325806, | |
| "num_tokens": 236894110.0, | |
| "step": 904 | |
| }, | |
| { | |
| "entropy": 0.4179428815841675, | |
| "epoch": 45.25, | |
| "grad_norm": 4.680195331573486, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0365, | |
| "mean_token_accuracy": 0.9855072498321533, | |
| "num_tokens": 237156164.0, | |
| "step": 905 | |
| }, | |
| { | |
| "entropy": 0.41826122999191284, | |
| "epoch": 45.3, | |
| "grad_norm": 4.5584282875061035, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0345, | |
| "mean_token_accuracy": 0.9875816702842712, | |
| "num_tokens": 237418221.0, | |
| "step": 906 | |
| }, | |
| { | |
| "entropy": 0.41678982973098755, | |
| "epoch": 45.35, | |
| "grad_norm": 11.087577819824219, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0433, | |
| "mean_token_accuracy": 0.9833915829658508, | |
| "num_tokens": 237680288.0, | |
| "step": 907 | |
| }, | |
| { | |
| "entropy": 0.4179689586162567, | |
| "epoch": 45.4, | |
| "grad_norm": 3.9059412479400635, | |
| "learning_rate": 1e-06, | |
| "loss": 0.039, | |
| "mean_token_accuracy": 0.9850313663482666, | |
| "num_tokens": 237942325.0, | |
| "step": 908 | |
| }, | |
| { | |
| "entropy": 0.4192636013031006, | |
| "epoch": 45.45, | |
| "grad_norm": 10.241129875183105, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0458, | |
| "mean_token_accuracy": 0.9825620651245117, | |
| "num_tokens": 238204376.0, | |
| "step": 909 | |
| }, | |
| { | |
| "entropy": 0.41725245118141174, | |
| "epoch": 45.5, | |
| "grad_norm": 7.922611236572266, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0506, | |
| "mean_token_accuracy": 0.9770580530166626, | |
| "num_tokens": 238466385.0, | |
| "step": 910 | |
| }, | |
| { | |
| "entropy": 0.41635823249816895, | |
| "epoch": 45.55, | |
| "grad_norm": 8.423656463623047, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0423, | |
| "mean_token_accuracy": 0.9865138530731201, | |
| "num_tokens": 238728428.0, | |
| "step": 911 | |
| }, | |
| { | |
| "entropy": 0.4160480499267578, | |
| "epoch": 45.6, | |
| "grad_norm": 5.975074768066406, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0459, | |
| "mean_token_accuracy": 0.9800771474838257, | |
| "num_tokens": 238990478.0, | |
| "step": 912 | |
| }, | |
| { | |
| "entropy": 0.4144827425479889, | |
| "epoch": 45.65, | |
| "grad_norm": 3.006824493408203, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0367, | |
| "mean_token_accuracy": 0.9852420091629028, | |
| "num_tokens": 239252534.0, | |
| "step": 913 | |
| }, | |
| { | |
| "entropy": 0.4175737500190735, | |
| "epoch": 45.7, | |
| "grad_norm": 4.676286697387695, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0336, | |
| "mean_token_accuracy": 0.9869791865348816, | |
| "num_tokens": 239514597.0, | |
| "step": 914 | |
| }, | |
| { | |
| "entropy": 0.4146580696105957, | |
| "epoch": 45.75, | |
| "grad_norm": 6.910285472869873, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0474, | |
| "mean_token_accuracy": 0.9800514578819275, | |
| "num_tokens": 239776645.0, | |
| "step": 915 | |
| }, | |
| { | |
| "entropy": 0.4156876802444458, | |
| "epoch": 45.8, | |
| "grad_norm": 11.429252624511719, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0488, | |
| "mean_token_accuracy": 0.9763739109039307, | |
| "num_tokens": 240038723.0, | |
| "step": 916 | |
| }, | |
| { | |
| "entropy": 0.4177379608154297, | |
| "epoch": 45.85, | |
| "grad_norm": 11.261126518249512, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0525, | |
| "mean_token_accuracy": 0.9810874462127686, | |
| "num_tokens": 240300777.0, | |
| "step": 917 | |
| }, | |
| { | |
| "entropy": 0.4161033630371094, | |
| "epoch": 45.9, | |
| "grad_norm": 7.90402364730835, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0458, | |
| "mean_token_accuracy": 0.9802880883216858, | |
| "num_tokens": 240562854.0, | |
| "step": 918 | |
| }, | |
| { | |
| "entropy": 0.4177625775337219, | |
| "epoch": 45.95, | |
| "grad_norm": 5.553004741668701, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0412, | |
| "mean_token_accuracy": 0.9828277230262756, | |
| "num_tokens": 240824930.0, | |
| "step": 919 | |
| }, | |
| { | |
| "entropy": 0.41690176725387573, | |
| "epoch": 46.0, | |
| "grad_norm": 4.296492099761963, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0408, | |
| "mean_token_accuracy": 0.9832713603973389, | |
| "num_tokens": 241086972.0, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 46.0, | |
| "eval_entropy": 0.41648760437965393, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9025537371635437, | |
| "eval_num_tokens": 241086972.0, | |
| "eval_runtime": 0.5516, | |
| "eval_samples_per_second": 453.233, | |
| "eval_steps_per_second": 1.813, | |
| "step": 920 | |
| }, | |
| { | |
| "entropy": 0.41621556878089905, | |
| "epoch": 46.05, | |
| "grad_norm": 4.381577014923096, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0412, | |
| "mean_token_accuracy": 0.9823434948921204, | |
| "num_tokens": 241349005.0, | |
| "step": 921 | |
| }, | |
| { | |
| "entropy": 0.4166816174983978, | |
| "epoch": 46.1, | |
| "grad_norm": 5.124240875244141, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0355, | |
| "mean_token_accuracy": 0.9883086681365967, | |
| "num_tokens": 241611046.0, | |
| "step": 922 | |
| }, | |
| { | |
| "entropy": 0.4189669191837311, | |
| "epoch": 46.15, | |
| "grad_norm": 5.907957553863525, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0346, | |
| "mean_token_accuracy": 0.9852768182754517, | |
| "num_tokens": 241873106.0, | |
| "step": 923 | |
| }, | |
| { | |
| "entropy": 0.41633808612823486, | |
| "epoch": 46.2, | |
| "grad_norm": 7.788755893707275, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0462, | |
| "mean_token_accuracy": 0.9764492511749268, | |
| "num_tokens": 242135124.0, | |
| "step": 924 | |
| }, | |
| { | |
| "entropy": 0.4165252447128296, | |
| "epoch": 46.25, | |
| "grad_norm": 5.588308811187744, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0439, | |
| "mean_token_accuracy": 0.977729856967926, | |
| "num_tokens": 242397208.0, | |
| "step": 925 | |
| }, | |
| { | |
| "entropy": 0.4190508723258972, | |
| "epoch": 46.3, | |
| "grad_norm": 2.2610719203948975, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0379, | |
| "mean_token_accuracy": 0.9827175140380859, | |
| "num_tokens": 242659248.0, | |
| "step": 926 | |
| }, | |
| { | |
| "entropy": 0.4179667830467224, | |
| "epoch": 46.35, | |
| "grad_norm": 3.7483508586883545, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0403, | |
| "mean_token_accuracy": 0.9825544953346252, | |
| "num_tokens": 242921347.0, | |
| "step": 927 | |
| }, | |
| { | |
| "entropy": 0.41781550645828247, | |
| "epoch": 46.4, | |
| "grad_norm": 5.416518688201904, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0388, | |
| "mean_token_accuracy": 0.985497236251831, | |
| "num_tokens": 243183388.0, | |
| "step": 928 | |
| }, | |
| { | |
| "entropy": 0.41736748814582825, | |
| "epoch": 46.45, | |
| "grad_norm": 3.2504491806030273, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0352, | |
| "mean_token_accuracy": 0.9862027764320374, | |
| "num_tokens": 243445417.0, | |
| "step": 929 | |
| }, | |
| { | |
| "entropy": 0.41560643911361694, | |
| "epoch": 46.5, | |
| "grad_norm": 5.10044002532959, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0618, | |
| "mean_token_accuracy": 0.9725239872932434, | |
| "num_tokens": 243707476.0, | |
| "step": 930 | |
| }, | |
| { | |
| "entropy": 0.41733497381210327, | |
| "epoch": 46.55, | |
| "grad_norm": 4.848858833312988, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0356, | |
| "mean_token_accuracy": 0.9850000143051147, | |
| "num_tokens": 243969504.0, | |
| "step": 931 | |
| }, | |
| { | |
| "entropy": 0.41563642024993896, | |
| "epoch": 46.6, | |
| "grad_norm": 8.018087387084961, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0373, | |
| "mean_token_accuracy": 0.9849973917007446, | |
| "num_tokens": 244231568.0, | |
| "step": 932 | |
| }, | |
| { | |
| "entropy": 0.415079265832901, | |
| "epoch": 46.65, | |
| "grad_norm": 4.895575523376465, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0393, | |
| "mean_token_accuracy": 0.9828532338142395, | |
| "num_tokens": 244493619.0, | |
| "step": 933 | |
| }, | |
| { | |
| "entropy": 0.4182128608226776, | |
| "epoch": 46.7, | |
| "grad_norm": 6.3751220703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0271, | |
| "mean_token_accuracy": 0.9895547032356262, | |
| "num_tokens": 244755635.0, | |
| "step": 934 | |
| }, | |
| { | |
| "entropy": 0.4158663749694824, | |
| "epoch": 46.75, | |
| "grad_norm": 4.839704513549805, | |
| "learning_rate": 1e-06, | |
| "loss": 0.05, | |
| "mean_token_accuracy": 0.9784736037254333, | |
| "num_tokens": 245017695.0, | |
| "step": 935 | |
| }, | |
| { | |
| "entropy": 0.4153270125389099, | |
| "epoch": 46.8, | |
| "grad_norm": 8.145550727844238, | |
| "learning_rate": 1e-06, | |
| "loss": 0.04, | |
| "mean_token_accuracy": 0.9837110638618469, | |
| "num_tokens": 245279766.0, | |
| "step": 936 | |
| }, | |
| { | |
| "entropy": 0.4160436987876892, | |
| "epoch": 46.85, | |
| "grad_norm": 3.469226598739624, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0365, | |
| "mean_token_accuracy": 0.9860582947731018, | |
| "num_tokens": 245541838.0, | |
| "step": 937 | |
| }, | |
| { | |
| "entropy": 0.4171416759490967, | |
| "epoch": 46.9, | |
| "grad_norm": 6.618907928466797, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0372, | |
| "mean_token_accuracy": 0.9837837815284729, | |
| "num_tokens": 245803865.0, | |
| "step": 938 | |
| }, | |
| { | |
| "entropy": 0.41581034660339355, | |
| "epoch": 46.95, | |
| "grad_norm": 3.842113494873047, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0429, | |
| "mean_token_accuracy": 0.9822601675987244, | |
| "num_tokens": 246065914.0, | |
| "step": 939 | |
| }, | |
| { | |
| "entropy": 0.4186379015445709, | |
| "epoch": 47.0, | |
| "grad_norm": 4.248569965362549, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0399, | |
| "mean_token_accuracy": 0.9806201457977295, | |
| "num_tokens": 246327983.0, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 47.0, | |
| "eval_entropy": 0.41853243112564087, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9012096524238586, | |
| "eval_num_tokens": 246327983.0, | |
| "eval_runtime": 0.5635, | |
| "eval_samples_per_second": 443.637, | |
| "eval_steps_per_second": 1.775, | |
| "step": 940 | |
| }, | |
| { | |
| "entropy": 0.41847890615463257, | |
| "epoch": 47.05, | |
| "grad_norm": 2.8673055171966553, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0353, | |
| "mean_token_accuracy": 0.9876089692115784, | |
| "num_tokens": 246590029.0, | |
| "step": 941 | |
| }, | |
| { | |
| "entropy": 0.4161589741706848, | |
| "epoch": 47.1, | |
| "grad_norm": 4.205740928649902, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0353, | |
| "mean_token_accuracy": 0.9830721020698547, | |
| "num_tokens": 246852085.0, | |
| "step": 942 | |
| }, | |
| { | |
| "entropy": 0.4160376787185669, | |
| "epoch": 47.15, | |
| "grad_norm": 6.36973237991333, | |
| "learning_rate": 1e-06, | |
| "loss": 0.032, | |
| "mean_token_accuracy": 0.9876118302345276, | |
| "num_tokens": 247114098.0, | |
| "step": 943 | |
| }, | |
| { | |
| "entropy": 0.4143107831478119, | |
| "epoch": 47.2, | |
| "grad_norm": 4.71365213394165, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0306, | |
| "mean_token_accuracy": 0.9879518151283264, | |
| "num_tokens": 247376171.0, | |
| "step": 944 | |
| }, | |
| { | |
| "entropy": 0.41483235359191895, | |
| "epoch": 47.25, | |
| "grad_norm": 3.322777509689331, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0476, | |
| "mean_token_accuracy": 0.9799240231513977, | |
| "num_tokens": 247638211.0, | |
| "step": 945 | |
| }, | |
| { | |
| "entropy": 0.41483980417251587, | |
| "epoch": 47.3, | |
| "grad_norm": 2.933046817779541, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0323, | |
| "mean_token_accuracy": 0.986775815486908, | |
| "num_tokens": 247900260.0, | |
| "step": 946 | |
| }, | |
| { | |
| "entropy": 0.4141683280467987, | |
| "epoch": 47.35, | |
| "grad_norm": 8.35169506072998, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0336, | |
| "mean_token_accuracy": 0.9878836870193481, | |
| "num_tokens": 248162322.0, | |
| "step": 947 | |
| }, | |
| { | |
| "entropy": 0.41416099667549133, | |
| "epoch": 47.4, | |
| "grad_norm": 8.373310089111328, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0417, | |
| "mean_token_accuracy": 0.9806157350540161, | |
| "num_tokens": 248424339.0, | |
| "step": 948 | |
| }, | |
| { | |
| "entropy": 0.414173424243927, | |
| "epoch": 47.45, | |
| "grad_norm": 4.492981433868408, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0282, | |
| "mean_token_accuracy": 0.9873417615890503, | |
| "num_tokens": 248686380.0, | |
| "step": 949 | |
| }, | |
| { | |
| "entropy": 0.4131871461868286, | |
| "epoch": 47.5, | |
| "grad_norm": 3.0811541080474854, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0426, | |
| "mean_token_accuracy": 0.9823922514915466, | |
| "num_tokens": 248948455.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 0.417308509349823, | |
| "epoch": 47.55, | |
| "grad_norm": 4.044399738311768, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0418, | |
| "mean_token_accuracy": 0.9816828966140747, | |
| "num_tokens": 249210498.0, | |
| "step": 951 | |
| }, | |
| { | |
| "entropy": 0.4158850908279419, | |
| "epoch": 47.6, | |
| "grad_norm": 4.268923759460449, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0306, | |
| "mean_token_accuracy": 0.9876543283462524, | |
| "num_tokens": 249472492.0, | |
| "step": 952 | |
| }, | |
| { | |
| "entropy": 0.41297072172164917, | |
| "epoch": 47.65, | |
| "grad_norm": 8.944028854370117, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0344, | |
| "mean_token_accuracy": 0.9835164546966553, | |
| "num_tokens": 249734524.0, | |
| "step": 953 | |
| }, | |
| { | |
| "entropy": 0.41369563341140747, | |
| "epoch": 47.7, | |
| "grad_norm": 11.018954277038574, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0594, | |
| "mean_token_accuracy": 0.9824694991111755, | |
| "num_tokens": 249996594.0, | |
| "step": 954 | |
| }, | |
| { | |
| "entropy": 0.412643164396286, | |
| "epoch": 47.75, | |
| "grad_norm": 12.327390670776367, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0443, | |
| "mean_token_accuracy": 0.9781690239906311, | |
| "num_tokens": 250258673.0, | |
| "step": 955 | |
| }, | |
| { | |
| "entropy": 0.41407012939453125, | |
| "epoch": 47.8, | |
| "grad_norm": 7.58923864364624, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0414, | |
| "mean_token_accuracy": 0.9808841347694397, | |
| "num_tokens": 250520742.0, | |
| "step": 956 | |
| }, | |
| { | |
| "entropy": 0.4143466055393219, | |
| "epoch": 47.85, | |
| "grad_norm": 5.423033237457275, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0429, | |
| "mean_token_accuracy": 0.9812453389167786, | |
| "num_tokens": 250782800.0, | |
| "step": 957 | |
| }, | |
| { | |
| "entropy": 0.41478925943374634, | |
| "epoch": 47.9, | |
| "grad_norm": 7.479618072509766, | |
| "learning_rate": 1e-06, | |
| "loss": 0.041, | |
| "mean_token_accuracy": 0.9829221963882446, | |
| "num_tokens": 251044842.0, | |
| "step": 958 | |
| }, | |
| { | |
| "entropy": 0.4130868911743164, | |
| "epoch": 47.95, | |
| "grad_norm": 3.688286304473877, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0472, | |
| "mean_token_accuracy": 0.9771689772605896, | |
| "num_tokens": 251306935.0, | |
| "step": 959 | |
| }, | |
| { | |
| "entropy": 0.41515159606933594, | |
| "epoch": 48.0, | |
| "grad_norm": 8.519615173339844, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0442, | |
| "mean_token_accuracy": 0.9841463565826416, | |
| "num_tokens": 251569003.0, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 48.0, | |
| "eval_entropy": 0.41626524925231934, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.8985214829444885, | |
| "eval_num_tokens": 251569003.0, | |
| "eval_runtime": 0.5603, | |
| "eval_samples_per_second": 446.208, | |
| "eval_steps_per_second": 1.785, | |
| "step": 960 | |
| }, | |
| { | |
| "entropy": 0.4145430326461792, | |
| "epoch": 48.05, | |
| "grad_norm": 6.435347080230713, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0393, | |
| "mean_token_accuracy": 0.9809423089027405, | |
| "num_tokens": 251831056.0, | |
| "step": 961 | |
| }, | |
| { | |
| "entropy": 0.4150584936141968, | |
| "epoch": 48.1, | |
| "grad_norm": 7.255335807800293, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0434, | |
| "mean_token_accuracy": 0.9832116961479187, | |
| "num_tokens": 252093151.0, | |
| "step": 962 | |
| }, | |
| { | |
| "entropy": 0.4166991114616394, | |
| "epoch": 48.15, | |
| "grad_norm": 6.78645133972168, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0513, | |
| "mean_token_accuracy": 0.9758485555648804, | |
| "num_tokens": 252355210.0, | |
| "step": 963 | |
| }, | |
| { | |
| "entropy": 0.4148341417312622, | |
| "epoch": 48.2, | |
| "grad_norm": 15.08552074432373, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0494, | |
| "mean_token_accuracy": 0.9825581312179565, | |
| "num_tokens": 252617271.0, | |
| "step": 964 | |
| }, | |
| { | |
| "entropy": 0.4165031313896179, | |
| "epoch": 48.25, | |
| "grad_norm": 3.8147826194763184, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0387, | |
| "mean_token_accuracy": 0.9838601350784302, | |
| "num_tokens": 252879318.0, | |
| "step": 965 | |
| }, | |
| { | |
| "entropy": 0.41597020626068115, | |
| "epoch": 48.3, | |
| "grad_norm": 3.696824550628662, | |
| "learning_rate": 1e-06, | |
| "loss": 0.033, | |
| "mean_token_accuracy": 0.9860774874687195, | |
| "num_tokens": 253141332.0, | |
| "step": 966 | |
| }, | |
| { | |
| "entropy": 0.4169830083847046, | |
| "epoch": 48.35, | |
| "grad_norm": 4.46604061126709, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0322, | |
| "mean_token_accuracy": 0.9871951341629028, | |
| "num_tokens": 253403367.0, | |
| "step": 967 | |
| }, | |
| { | |
| "entropy": 0.4176675081253052, | |
| "epoch": 48.4, | |
| "grad_norm": 9.332520484924316, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0479, | |
| "mean_token_accuracy": 0.978586733341217, | |
| "num_tokens": 253665460.0, | |
| "step": 968 | |
| }, | |
| { | |
| "entropy": 0.4181513786315918, | |
| "epoch": 48.45, | |
| "grad_norm": 3.3263468742370605, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0399, | |
| "mean_token_accuracy": 0.9831246733665466, | |
| "num_tokens": 253927527.0, | |
| "step": 969 | |
| }, | |
| { | |
| "entropy": 0.415880024433136, | |
| "epoch": 48.5, | |
| "grad_norm": 6.814778804779053, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0372, | |
| "mean_token_accuracy": 0.983660101890564, | |
| "num_tokens": 254189584.0, | |
| "step": 970 | |
| }, | |
| { | |
| "entropy": 0.4180268943309784, | |
| "epoch": 48.55, | |
| "grad_norm": 5.279390335083008, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0286, | |
| "mean_token_accuracy": 0.9860907793045044, | |
| "num_tokens": 254451675.0, | |
| "step": 971 | |
| }, | |
| { | |
| "entropy": 0.41518670320510864, | |
| "epoch": 48.6, | |
| "grad_norm": 4.843584060668945, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0416, | |
| "mean_token_accuracy": 0.9814371466636658, | |
| "num_tokens": 254713674.0, | |
| "step": 972 | |
| }, | |
| { | |
| "entropy": 0.41497260332107544, | |
| "epoch": 48.65, | |
| "grad_norm": 4.6544036865234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0347, | |
| "mean_token_accuracy": 0.9851101636886597, | |
| "num_tokens": 254975748.0, | |
| "step": 973 | |
| }, | |
| { | |
| "entropy": 0.41487622261047363, | |
| "epoch": 48.7, | |
| "grad_norm": 11.355942726135254, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0363, | |
| "mean_token_accuracy": 0.9817721247673035, | |
| "num_tokens": 255237755.0, | |
| "step": 974 | |
| }, | |
| { | |
| "entropy": 0.4146149754524231, | |
| "epoch": 48.75, | |
| "grad_norm": 5.135185241699219, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0368, | |
| "mean_token_accuracy": 0.9845132827758789, | |
| "num_tokens": 255499803.0, | |
| "step": 975 | |
| }, | |
| { | |
| "entropy": 0.4158018231391907, | |
| "epoch": 48.8, | |
| "grad_norm": 9.837135314941406, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0466, | |
| "mean_token_accuracy": 0.98525071144104, | |
| "num_tokens": 255761851.0, | |
| "step": 976 | |
| }, | |
| { | |
| "entropy": 0.41550880670547485, | |
| "epoch": 48.85, | |
| "grad_norm": 2.997006893157959, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0314, | |
| "mean_token_accuracy": 0.9878566861152649, | |
| "num_tokens": 256023893.0, | |
| "step": 977 | |
| }, | |
| { | |
| "entropy": 0.4196978807449341, | |
| "epoch": 48.9, | |
| "grad_norm": 4.659507751464844, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0395, | |
| "mean_token_accuracy": 0.9817137122154236, | |
| "num_tokens": 256285938.0, | |
| "step": 978 | |
| }, | |
| { | |
| "entropy": 0.4174753427505493, | |
| "epoch": 48.95, | |
| "grad_norm": 7.701611518859863, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0444, | |
| "mean_token_accuracy": 0.9807291626930237, | |
| "num_tokens": 256547989.0, | |
| "step": 979 | |
| }, | |
| { | |
| "entropy": 0.41477835178375244, | |
| "epoch": 49.0, | |
| "grad_norm": 5.701596260070801, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0334, | |
| "mean_token_accuracy": 0.9884169697761536, | |
| "num_tokens": 256810037.0, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 49.0, | |
| "eval_entropy": 0.4165271818637848, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.9005376100540161, | |
| "eval_num_tokens": 256810037.0, | |
| "eval_runtime": 0.5618, | |
| "eval_samples_per_second": 445.006, | |
| "eval_steps_per_second": 1.78, | |
| "step": 980 | |
| }, | |
| { | |
| "entropy": 0.4151040017604828, | |
| "epoch": 49.05, | |
| "grad_norm": 7.52377462387085, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0405, | |
| "mean_token_accuracy": 0.983753502368927, | |
| "num_tokens": 257072085.0, | |
| "step": 981 | |
| }, | |
| { | |
| "entropy": 0.41499486565589905, | |
| "epoch": 49.1, | |
| "grad_norm": 5.606761455535889, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0359, | |
| "mean_token_accuracy": 0.9828641414642334, | |
| "num_tokens": 257334114.0, | |
| "step": 982 | |
| }, | |
| { | |
| "entropy": 0.4133911728858948, | |
| "epoch": 49.15, | |
| "grad_norm": 3.4444432258605957, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0296, | |
| "mean_token_accuracy": 0.9844412803649902, | |
| "num_tokens": 257596187.0, | |
| "step": 983 | |
| }, | |
| { | |
| "entropy": 0.41423529386520386, | |
| "epoch": 49.2, | |
| "grad_norm": 4.566822052001953, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0335, | |
| "mean_token_accuracy": 0.9858490824699402, | |
| "num_tokens": 257858264.0, | |
| "step": 984 | |
| }, | |
| { | |
| "entropy": 0.41284215450286865, | |
| "epoch": 49.25, | |
| "grad_norm": 10.468485832214355, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0444, | |
| "mean_token_accuracy": 0.9830007553100586, | |
| "num_tokens": 258120309.0, | |
| "step": 985 | |
| }, | |
| { | |
| "entropy": 0.41363033652305603, | |
| "epoch": 49.3, | |
| "grad_norm": 5.765902996063232, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0312, | |
| "mean_token_accuracy": 0.985897421836853, | |
| "num_tokens": 258382396.0, | |
| "step": 986 | |
| }, | |
| { | |
| "entropy": 0.4127556085586548, | |
| "epoch": 49.35, | |
| "grad_norm": 7.179581165313721, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0405, | |
| "mean_token_accuracy": 0.9815910458564758, | |
| "num_tokens": 258644477.0, | |
| "step": 987 | |
| }, | |
| { | |
| "entropy": 0.41511815786361694, | |
| "epoch": 49.4, | |
| "grad_norm": 8.856917381286621, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0415, | |
| "mean_token_accuracy": 0.9796854257583618, | |
| "num_tokens": 258906530.0, | |
| "step": 988 | |
| }, | |
| { | |
| "entropy": 0.41756629943847656, | |
| "epoch": 49.45, | |
| "grad_norm": 5.423385143280029, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0357, | |
| "mean_token_accuracy": 0.9856938719749451, | |
| "num_tokens": 259168554.0, | |
| "step": 989 | |
| }, | |
| { | |
| "entropy": 0.41290774941444397, | |
| "epoch": 49.5, | |
| "grad_norm": 7.986942768096924, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0399, | |
| "mean_token_accuracy": 0.9850448369979858, | |
| "num_tokens": 259430625.0, | |
| "step": 990 | |
| }, | |
| { | |
| "entropy": 0.4130344092845917, | |
| "epoch": 49.55, | |
| "grad_norm": 5.943664073944092, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0351, | |
| "mean_token_accuracy": 0.9877350926399231, | |
| "num_tokens": 259692705.0, | |
| "step": 991 | |
| }, | |
| { | |
| "entropy": 0.4145318269729614, | |
| "epoch": 49.6, | |
| "grad_norm": 6.782146453857422, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0332, | |
| "mean_token_accuracy": 0.9839532971382141, | |
| "num_tokens": 259954702.0, | |
| "step": 992 | |
| }, | |
| { | |
| "entropy": 0.41375476121902466, | |
| "epoch": 49.65, | |
| "grad_norm": 9.113283157348633, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0471, | |
| "mean_token_accuracy": 0.9787600636482239, | |
| "num_tokens": 260216740.0, | |
| "step": 993 | |
| }, | |
| { | |
| "entropy": 0.41189008951187134, | |
| "epoch": 49.7, | |
| "grad_norm": 10.127237319946289, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0328, | |
| "mean_token_accuracy": 0.9894217252731323, | |
| "num_tokens": 260478817.0, | |
| "step": 994 | |
| }, | |
| { | |
| "entropy": 0.4134674072265625, | |
| "epoch": 49.75, | |
| "grad_norm": 3.2917778491973877, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0389, | |
| "mean_token_accuracy": 0.9848576784133911, | |
| "num_tokens": 260740863.0, | |
| "step": 995 | |
| }, | |
| { | |
| "entropy": 0.4141203761100769, | |
| "epoch": 49.8, | |
| "grad_norm": 12.72966194152832, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0411, | |
| "mean_token_accuracy": 0.9854904413223267, | |
| "num_tokens": 261002915.0, | |
| "step": 996 | |
| }, | |
| { | |
| "entropy": 0.41340839862823486, | |
| "epoch": 49.85, | |
| "grad_norm": 5.995748519897461, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0478, | |
| "mean_token_accuracy": 0.9802231192588806, | |
| "num_tokens": 261264952.0, | |
| "step": 997 | |
| }, | |
| { | |
| "entropy": 0.4108712673187256, | |
| "epoch": 49.9, | |
| "grad_norm": 2.843021869659424, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0386, | |
| "mean_token_accuracy": 0.9838079214096069, | |
| "num_tokens": 261526973.0, | |
| "step": 998 | |
| }, | |
| { | |
| "entropy": 0.41318273544311523, | |
| "epoch": 49.95, | |
| "grad_norm": 5.03126859664917, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0338, | |
| "mean_token_accuracy": 0.982284665107727, | |
| "num_tokens": 261789005.0, | |
| "step": 999 | |
| }, | |
| { | |
| "entropy": 0.41257742047309875, | |
| "epoch": 50.0, | |
| "grad_norm": 6.049678802490234, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0375, | |
| "mean_token_accuracy": 0.9856985807418823, | |
| "num_tokens": 262051053.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "eval_entropy": 0.41491925716400146, | |
| "eval_loss": null, | |
| "eval_mean_token_accuracy": 0.8998655676841736, | |
| "eval_num_tokens": 262051053.0, | |
| "eval_runtime": 0.5646, | |
| "eval_samples_per_second": 442.828, | |
| "eval_steps_per_second": 1.771, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "step": 1000, | |
| "total_flos": 227901702144000.0, | |
| "train_loss": 0.14342150183208285, | |
| "train_runtime": 3480.9498, | |
| "train_samples_per_second": 71.819, | |
| "train_steps_per_second": 0.287 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 1000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 50, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 227901702144000.0, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |