| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 6.0, | |
| "eval_steps": 500, | |
| "global_step": 2328, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 4.914482545852661, | |
| "epoch": 0.02577319587628866, | |
| "grad_norm": 232.0, | |
| "learning_rate": 1.9313304721030046e-07, | |
| "loss": 5.5997, | |
| "mean_token_accuracy": 0.20726535096764565, | |
| "num_tokens": 266.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 4.538765811920166, | |
| "epoch": 0.05154639175257732, | |
| "grad_norm": 160.0, | |
| "learning_rate": 4.07725321888412e-07, | |
| "loss": 5.4934, | |
| "mean_token_accuracy": 0.21887856498360633, | |
| "num_tokens": 627.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 4.939351224899292, | |
| "epoch": 0.07731958762886598, | |
| "grad_norm": 118.5, | |
| "learning_rate": 6.223175965665236e-07, | |
| "loss": 5.7523, | |
| "mean_token_accuracy": 0.1917542487382889, | |
| "num_tokens": 926.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 4.698552560806275, | |
| "epoch": 0.10309278350515463, | |
| "grad_norm": 187.0, | |
| "learning_rate": 8.369098712446352e-07, | |
| "loss": 5.1204, | |
| "mean_token_accuracy": 0.22331946194171906, | |
| "num_tokens": 1259.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 4.72123908996582, | |
| "epoch": 0.12886597938144329, | |
| "grad_norm": 166.0, | |
| "learning_rate": 1.051502145922747e-06, | |
| "loss": 5.198, | |
| "mean_token_accuracy": 0.21775908395648003, | |
| "num_tokens": 1550.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 4.719539976119995, | |
| "epoch": 0.15463917525773196, | |
| "grad_norm": 200.0, | |
| "learning_rate": 1.2660944206008586e-06, | |
| "loss": 5.0832, | |
| "mean_token_accuracy": 0.2405412092804909, | |
| "num_tokens": 1839.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 4.78384747505188, | |
| "epoch": 0.18041237113402062, | |
| "grad_norm": 149.0, | |
| "learning_rate": 1.48068669527897e-06, | |
| "loss": 5.0961, | |
| "mean_token_accuracy": 0.19893446192145348, | |
| "num_tokens": 2171.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 4.974851608276367, | |
| "epoch": 0.20618556701030927, | |
| "grad_norm": 177.0, | |
| "learning_rate": 1.6952789699570817e-06, | |
| "loss": 5.2262, | |
| "mean_token_accuracy": 0.20617640018463135, | |
| "num_tokens": 2450.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 4.705893039703369, | |
| "epoch": 0.23195876288659795, | |
| "grad_norm": 182.0, | |
| "learning_rate": 1.9098712446351934e-06, | |
| "loss": 4.8345, | |
| "mean_token_accuracy": 0.22286981157958508, | |
| "num_tokens": 2786.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 4.696354675292969, | |
| "epoch": 0.25773195876288657, | |
| "grad_norm": 154.0, | |
| "learning_rate": 2.124463519313305e-06, | |
| "loss": 5.023, | |
| "mean_token_accuracy": 0.25020611882209776, | |
| "num_tokens": 3090.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 4.626954746246338, | |
| "epoch": 0.28350515463917525, | |
| "grad_norm": 59.0, | |
| "learning_rate": 2.3390557939914167e-06, | |
| "loss": 4.5576, | |
| "mean_token_accuracy": 0.2621785670518875, | |
| "num_tokens": 3383.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 4.682432174682617, | |
| "epoch": 0.30927835051546393, | |
| "grad_norm": 76.0, | |
| "learning_rate": 2.553648068669528e-06, | |
| "loss": 4.792, | |
| "mean_token_accuracy": 0.26012246310710907, | |
| "num_tokens": 3650.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 4.563408708572387, | |
| "epoch": 0.33505154639175255, | |
| "grad_norm": 43.25, | |
| "learning_rate": 2.7682403433476396e-06, | |
| "loss": 4.5349, | |
| "mean_token_accuracy": 0.22586977183818818, | |
| "num_tokens": 3990.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 4.270893955230713, | |
| "epoch": 0.36082474226804123, | |
| "grad_norm": 65.0, | |
| "learning_rate": 2.982832618025751e-06, | |
| "loss": 4.147, | |
| "mean_token_accuracy": 0.28786555826663973, | |
| "num_tokens": 4345.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 4.0949385404586796, | |
| "epoch": 0.3865979381443299, | |
| "grad_norm": 55.75, | |
| "learning_rate": 3.197424892703863e-06, | |
| "loss": 3.7088, | |
| "mean_token_accuracy": 0.34470676481723783, | |
| "num_tokens": 4601.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 4.128937864303589, | |
| "epoch": 0.41237113402061853, | |
| "grad_norm": 49.25, | |
| "learning_rate": 3.412017167381975e-06, | |
| "loss": 3.9797, | |
| "mean_token_accuracy": 0.3258319616317749, | |
| "num_tokens": 5024.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 3.8451520442962646, | |
| "epoch": 0.4381443298969072, | |
| "grad_norm": 72.5, | |
| "learning_rate": 3.6266094420600863e-06, | |
| "loss": 3.7548, | |
| "mean_token_accuracy": 0.3747887283563614, | |
| "num_tokens": 5311.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 3.712318539619446, | |
| "epoch": 0.4639175257731959, | |
| "grad_norm": 44.0, | |
| "learning_rate": 3.841201716738197e-06, | |
| "loss": 3.6765, | |
| "mean_token_accuracy": 0.3758635461330414, | |
| "num_tokens": 5651.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 3.763609743118286, | |
| "epoch": 0.4896907216494845, | |
| "grad_norm": 43.25, | |
| "learning_rate": 4.055793991416309e-06, | |
| "loss": 3.6278, | |
| "mean_token_accuracy": 0.3873393088579178, | |
| "num_tokens": 5983.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 3.781333661079407, | |
| "epoch": 0.5154639175257731, | |
| "grad_norm": 67.0, | |
| "learning_rate": 4.270386266094421e-06, | |
| "loss": 3.7018, | |
| "mean_token_accuracy": 0.3747037798166275, | |
| "num_tokens": 6296.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 3.6542357921600344, | |
| "epoch": 0.5412371134020618, | |
| "grad_norm": 63.75, | |
| "learning_rate": 4.484978540772533e-06, | |
| "loss": 3.2866, | |
| "mean_token_accuracy": 0.4405659481883049, | |
| "num_tokens": 6658.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 3.38469135761261, | |
| "epoch": 0.5670103092783505, | |
| "grad_norm": 88.0, | |
| "learning_rate": 4.699570815450644e-06, | |
| "loss": 3.1957, | |
| "mean_token_accuracy": 0.41826934069395066, | |
| "num_tokens": 6997.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 3.9692420244216917, | |
| "epoch": 0.5927835051546392, | |
| "grad_norm": 42.5, | |
| "learning_rate": 4.914163090128756e-06, | |
| "loss": 4.0617, | |
| "mean_token_accuracy": 0.3779368013143539, | |
| "num_tokens": 7376.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 3.725129175186157, | |
| "epoch": 0.6185567010309279, | |
| "grad_norm": 49.5, | |
| "learning_rate": 4.999898809142829e-06, | |
| "loss": 3.3293, | |
| "mean_token_accuracy": 0.4062195152044296, | |
| "num_tokens": 7688.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 3.673329973220825, | |
| "epoch": 0.6443298969072165, | |
| "grad_norm": 84.5, | |
| "learning_rate": 4.9992804502362914e-06, | |
| "loss": 3.5039, | |
| "mean_token_accuracy": 0.41747846007347106, | |
| "num_tokens": 7988.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 3.5831470012664797, | |
| "epoch": 0.6701030927835051, | |
| "grad_norm": 79.0, | |
| "learning_rate": 4.998100088445351e-06, | |
| "loss": 3.3134, | |
| "mean_token_accuracy": 0.4633125364780426, | |
| "num_tokens": 8263.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 3.5552636861801146, | |
| "epoch": 0.6958762886597938, | |
| "grad_norm": 65.0, | |
| "learning_rate": 4.996357989193094e-06, | |
| "loss": 3.4462, | |
| "mean_token_accuracy": 0.40612466633319855, | |
| "num_tokens": 8590.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 3.833337092399597, | |
| "epoch": 0.7216494845360825, | |
| "grad_norm": 39.5, | |
| "learning_rate": 4.994054544218193e-06, | |
| "loss": 3.7615, | |
| "mean_token_accuracy": 0.3914962366223335, | |
| "num_tokens": 8945.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 3.6764284133911134, | |
| "epoch": 0.7474226804123711, | |
| "grad_norm": 60.25, | |
| "learning_rate": 4.991190271486816e-06, | |
| "loss": 3.4472, | |
| "mean_token_accuracy": 0.4174343138933182, | |
| "num_tokens": 9269.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 3.5497223377227782, | |
| "epoch": 0.7731958762886598, | |
| "grad_norm": 76.0, | |
| "learning_rate": 4.987765815076157e-06, | |
| "loss": 3.1154, | |
| "mean_token_accuracy": 0.44447133839130404, | |
| "num_tokens": 9550.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 3.8354516506195067, | |
| "epoch": 0.7989690721649485, | |
| "grad_norm": 58.25, | |
| "learning_rate": 4.9837819450296e-06, | |
| "loss": 3.4305, | |
| "mean_token_accuracy": 0.42582335472106936, | |
| "num_tokens": 9899.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 3.752107357978821, | |
| "epoch": 0.8247422680412371, | |
| "grad_norm": 56.5, | |
| "learning_rate": 4.979239557183571e-06, | |
| "loss": 3.6763, | |
| "mean_token_accuracy": 0.4198161542415619, | |
| "num_tokens": 10174.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 3.641463875770569, | |
| "epoch": 0.8505154639175257, | |
| "grad_norm": 61.25, | |
| "learning_rate": 4.974139672966082e-06, | |
| "loss": 3.5237, | |
| "mean_token_accuracy": 0.3649302959442139, | |
| "num_tokens": 10520.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 3.487735724449158, | |
| "epoch": 0.8762886597938144, | |
| "grad_norm": 55.0, | |
| "learning_rate": 4.968483439167061e-06, | |
| "loss": 3.3832, | |
| "mean_token_accuracy": 0.4202912449836731, | |
| "num_tokens": 10834.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 3.659078526496887, | |
| "epoch": 0.9020618556701031, | |
| "grad_norm": 58.0, | |
| "learning_rate": 4.9622721276804674e-06, | |
| "loss": 3.0981, | |
| "mean_token_accuracy": 0.45194968581199646, | |
| "num_tokens": 11110.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 3.3279967069625855, | |
| "epoch": 0.9278350515463918, | |
| "grad_norm": 81.5, | |
| "learning_rate": 4.955507135218291e-06, | |
| "loss": 3.2825, | |
| "mean_token_accuracy": 0.42670700550079343, | |
| "num_tokens": 11410.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 3.2990634202957154, | |
| "epoch": 0.9536082474226805, | |
| "grad_norm": 52.25, | |
| "learning_rate": 4.948189982996479e-06, | |
| "loss": 2.945, | |
| "mean_token_accuracy": 0.46619434356689454, | |
| "num_tokens": 11758.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 3.442564105987549, | |
| "epoch": 0.979381443298969, | |
| "grad_norm": 77.0, | |
| "learning_rate": 4.940322316392865e-06, | |
| "loss": 3.4907, | |
| "mean_token_accuracy": 0.465116411447525, | |
| "num_tokens": 12008.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 3.4102420568466187, | |
| "epoch": 1.0051546391752577, | |
| "grad_norm": 105.5, | |
| "learning_rate": 4.931905904577182e-06, | |
| "loss": 3.4858, | |
| "mean_token_accuracy": 0.4123460859060287, | |
| "num_tokens": 12288.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 3.615120697021484, | |
| "epoch": 1.0309278350515463, | |
| "grad_norm": 57.75, | |
| "learning_rate": 4.922942640113234e-06, | |
| "loss": 3.1624, | |
| "mean_token_accuracy": 0.45850674211978915, | |
| "num_tokens": 12600.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 3.5458458185195925, | |
| "epoch": 1.056701030927835, | |
| "grad_norm": 64.5, | |
| "learning_rate": 4.913434538533324e-06, | |
| "loss": 3.3334, | |
| "mean_token_accuracy": 0.46345340013504027, | |
| "num_tokens": 12888.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 3.283875823020935, | |
| "epoch": 1.0824742268041236, | |
| "grad_norm": 84.5, | |
| "learning_rate": 4.90338373788503e-06, | |
| "loss": 3.2497, | |
| "mean_token_accuracy": 0.47709590196609497, | |
| "num_tokens": 13180.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 3.4606318950653074, | |
| "epoch": 1.1082474226804124, | |
| "grad_norm": 72.0, | |
| "learning_rate": 4.892792498250431e-06, | |
| "loss": 3.5329, | |
| "mean_token_accuracy": 0.4233353078365326, | |
| "num_tokens": 13485.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 3.3683727979660034, | |
| "epoch": 1.134020618556701, | |
| "grad_norm": 58.25, | |
| "learning_rate": 4.881663201237889e-06, | |
| "loss": 3.0689, | |
| "mean_token_accuracy": 0.4563048958778381, | |
| "num_tokens": 13849.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 3.3520001411437987, | |
| "epoch": 1.1597938144329896, | |
| "grad_norm": 49.5, | |
| "learning_rate": 4.869998349446514e-06, | |
| "loss": 3.0789, | |
| "mean_token_accuracy": 0.4614581674337387, | |
| "num_tokens": 14121.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 3.3299176692962646, | |
| "epoch": 1.1855670103092784, | |
| "grad_norm": 64.5, | |
| "learning_rate": 4.857800565903405e-06, | |
| "loss": 3.2253, | |
| "mean_token_accuracy": 0.4312807470560074, | |
| "num_tokens": 14503.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 3.6635308504104613, | |
| "epoch": 1.211340206185567, | |
| "grad_norm": 52.5, | |
| "learning_rate": 4.845072593473826e-06, | |
| "loss": 3.2584, | |
| "mean_token_accuracy": 0.46550854444503786, | |
| "num_tokens": 14792.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 3.2862602949142454, | |
| "epoch": 1.2371134020618557, | |
| "grad_norm": 56.25, | |
| "learning_rate": 4.831817294244432e-06, | |
| "loss": 3.1376, | |
| "mean_token_accuracy": 0.4864483565092087, | |
| "num_tokens": 15071.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 3.4670738697052004, | |
| "epoch": 1.2628865979381443, | |
| "grad_norm": 60.25, | |
| "learning_rate": 4.8180376488796755e-06, | |
| "loss": 3.4598, | |
| "mean_token_accuracy": 0.38327425718307495, | |
| "num_tokens": 15340.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 3.4762473344802856, | |
| "epoch": 1.2886597938144329, | |
| "grad_norm": 72.0, | |
| "learning_rate": 4.803736755951564e-06, | |
| "loss": 3.23, | |
| "mean_token_accuracy": 0.4640804290771484, | |
| "num_tokens": 15658.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 3.4152861833572388, | |
| "epoch": 1.3144329896907216, | |
| "grad_norm": 60.5, | |
| "learning_rate": 4.788917831242895e-06, | |
| "loss": 3.1239, | |
| "mean_token_accuracy": 0.47079411447048186, | |
| "num_tokens": 15943.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 3.3919612884521486, | |
| "epoch": 1.3402061855670104, | |
| "grad_norm": 116.5, | |
| "learning_rate": 4.773584207024135e-06, | |
| "loss": 3.193, | |
| "mean_token_accuracy": 0.40636845529079435, | |
| "num_tokens": 16268.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 3.520834803581238, | |
| "epoch": 1.365979381443299, | |
| "grad_norm": 67.5, | |
| "learning_rate": 4.7577393313041025e-06, | |
| "loss": 3.3889, | |
| "mean_token_accuracy": 0.4428008824586868, | |
| "num_tokens": 16611.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 3.5736474275588987, | |
| "epoch": 1.3917525773195876, | |
| "grad_norm": 51.25, | |
| "learning_rate": 4.741386767054636e-06, | |
| "loss": 3.4062, | |
| "mean_token_accuracy": 0.408548378944397, | |
| "num_tokens": 16917.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 3.4252942323684694, | |
| "epoch": 1.4175257731958764, | |
| "grad_norm": 66.5, | |
| "learning_rate": 4.724530191409399e-06, | |
| "loss": 3.2558, | |
| "mean_token_accuracy": 0.4369684547185898, | |
| "num_tokens": 17283.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 3.335077738761902, | |
| "epoch": 1.443298969072165, | |
| "grad_norm": 57.25, | |
| "learning_rate": 4.707173394837017e-06, | |
| "loss": 3.157, | |
| "mean_token_accuracy": 0.4764990329742432, | |
| "num_tokens": 17604.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 3.519501209259033, | |
| "epoch": 1.4690721649484537, | |
| "grad_norm": 57.5, | |
| "learning_rate": 4.689320280288731e-06, | |
| "loss": 3.4428, | |
| "mean_token_accuracy": 0.4320195406675339, | |
| "num_tokens": 17939.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 3.5304367780685424, | |
| "epoch": 1.4948453608247423, | |
| "grad_norm": 55.0, | |
| "learning_rate": 4.67097486232076e-06, | |
| "loss": 3.2563, | |
| "mean_token_accuracy": 0.4069958388805389, | |
| "num_tokens": 18234.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 3.662277579307556, | |
| "epoch": 1.5206185567010309, | |
| "grad_norm": 67.0, | |
| "learning_rate": 4.65214126619156e-06, | |
| "loss": 3.8052, | |
| "mean_token_accuracy": 0.40810766220092776, | |
| "num_tokens": 18574.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 3.2554187536239625, | |
| "epoch": 1.5463917525773194, | |
| "grad_norm": 69.0, | |
| "learning_rate": 4.632823726934199e-06, | |
| "loss": 3.0088, | |
| "mean_token_accuracy": 0.4515380173921585, | |
| "num_tokens": 18886.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 3.4018918752670286, | |
| "epoch": 1.5721649484536082, | |
| "grad_norm": 60.25, | |
| "learning_rate": 4.613026588404036e-06, | |
| "loss": 3.3767, | |
| "mean_token_accuracy": 0.4256245791912079, | |
| "num_tokens": 19227.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 3.4922634601593017, | |
| "epoch": 1.597938144329897, | |
| "grad_norm": 45.5, | |
| "learning_rate": 4.592754302301942e-06, | |
| "loss": 3.0169, | |
| "mean_token_accuracy": 0.4801509857177734, | |
| "num_tokens": 19582.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 3.3631627798080443, | |
| "epoch": 1.6237113402061856, | |
| "grad_norm": 74.0, | |
| "learning_rate": 4.572011427173263e-06, | |
| "loss": 3.2208, | |
| "mean_token_accuracy": 0.4472234547138214, | |
| "num_tokens": 19934.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 3.353942036628723, | |
| "epoch": 1.6494845360824741, | |
| "grad_norm": 54.25, | |
| "learning_rate": 4.550802627382756e-06, | |
| "loss": 3.2015, | |
| "mean_token_accuracy": 0.4314853399991989, | |
| "num_tokens": 20286.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 3.338911604881287, | |
| "epoch": 1.675257731958763, | |
| "grad_norm": 89.0, | |
| "learning_rate": 4.529132672065738e-06, | |
| "loss": 3.09, | |
| "mean_token_accuracy": 0.46670873165130616, | |
| "num_tokens": 20556.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 3.380303430557251, | |
| "epoch": 1.7010309278350515, | |
| "grad_norm": 76.0, | |
| "learning_rate": 4.507006434055663e-06, | |
| "loss": 3.2405, | |
| "mean_token_accuracy": 0.4376333147287369, | |
| "num_tokens": 20862.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 3.491442632675171, | |
| "epoch": 1.7268041237113403, | |
| "grad_norm": 50.25, | |
| "learning_rate": 4.484428888788395e-06, | |
| "loss": 3.4112, | |
| "mean_token_accuracy": 0.43711408972740173, | |
| "num_tokens": 21164.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 3.451434350013733, | |
| "epoch": 1.7525773195876289, | |
| "grad_norm": 105.5, | |
| "learning_rate": 4.461405113183396e-06, | |
| "loss": 3.0601, | |
| "mean_token_accuracy": 0.42957675755023955, | |
| "num_tokens": 21460.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 3.329232668876648, | |
| "epoch": 1.7783505154639174, | |
| "grad_norm": 63.25, | |
| "learning_rate": 4.437940284502105e-06, | |
| "loss": 3.1711, | |
| "mean_token_accuracy": 0.4769585371017456, | |
| "num_tokens": 21817.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 3.1766634225845336, | |
| "epoch": 1.8041237113402062, | |
| "grad_norm": 47.25, | |
| "learning_rate": 4.414039679183749e-06, | |
| "loss": 3.5047, | |
| "mean_token_accuracy": 0.43050127625465395, | |
| "num_tokens": 22163.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 3.394679617881775, | |
| "epoch": 1.829896907216495, | |
| "grad_norm": 64.0, | |
| "learning_rate": 4.389708671658844e-06, | |
| "loss": 3.0371, | |
| "mean_token_accuracy": 0.45515852570533755, | |
| "num_tokens": 22436.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 3.4196753025054933, | |
| "epoch": 1.8556701030927836, | |
| "grad_norm": 217.0, | |
| "learning_rate": 4.3649527331406796e-06, | |
| "loss": 3.612, | |
| "mean_token_accuracy": 0.4677813768386841, | |
| "num_tokens": 22709.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 3.4829108476638795, | |
| "epoch": 1.8814432989690721, | |
| "grad_norm": 60.25, | |
| "learning_rate": 4.339777430395022e-06, | |
| "loss": 3.3854, | |
| "mean_token_accuracy": 0.41964206099510193, | |
| "num_tokens": 23076.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 3.3097528219223022, | |
| "epoch": 1.9072164948453607, | |
| "grad_norm": 63.25, | |
| "learning_rate": 4.314188424488344e-06, | |
| "loss": 2.8523, | |
| "mean_token_accuracy": 0.48268236219882965, | |
| "num_tokens": 23403.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 3.3857040882110594, | |
| "epoch": 1.9329896907216495, | |
| "grad_norm": 65.5, | |
| "learning_rate": 4.288191469514839e-06, | |
| "loss": 2.9944, | |
| "mean_token_accuracy": 0.43578424155712125, | |
| "num_tokens": 23722.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 3.408646750450134, | |
| "epoch": 1.9587628865979383, | |
| "grad_norm": 68.5, | |
| "learning_rate": 4.261792411302525e-06, | |
| "loss": 3.4775, | |
| "mean_token_accuracy": 0.43519311845302583, | |
| "num_tokens": 23993.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 3.117679166793823, | |
| "epoch": 1.9845360824742269, | |
| "grad_norm": 57.25, | |
| "learning_rate": 4.234997186098716e-06, | |
| "loss": 2.829, | |
| "mean_token_accuracy": 0.43563964366912844, | |
| "num_tokens": 24324.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 3.5846817255020142, | |
| "epoch": 2.0103092783505154, | |
| "grad_norm": 41.5, | |
| "learning_rate": 4.207811819235164e-06, | |
| "loss": 3.374, | |
| "mean_token_accuracy": 0.4384703665971756, | |
| "num_tokens": 24690.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 3.0135667085647584, | |
| "epoch": 2.036082474226804, | |
| "grad_norm": 49.25, | |
| "learning_rate": 4.180242423773166e-06, | |
| "loss": 2.9591, | |
| "mean_token_accuracy": 0.5096089750528335, | |
| "num_tokens": 24959.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 3.289955735206604, | |
| "epoch": 2.0618556701030926, | |
| "grad_norm": 52.75, | |
| "learning_rate": 4.1522951991289465e-06, | |
| "loss": 3.218, | |
| "mean_token_accuracy": 0.39164299368858335, | |
| "num_tokens": 25285.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 3.3328727006912233, | |
| "epoch": 2.0876288659793816, | |
| "grad_norm": 58.25, | |
| "learning_rate": 4.1239764296796175e-06, | |
| "loss": 2.9698, | |
| "mean_token_accuracy": 0.4740764260292053, | |
| "num_tokens": 25578.0, | |
| "step": 810 | |
| }, | |
| { | |
| "entropy": 3.3463461637496947, | |
| "epoch": 2.11340206185567, | |
| "grad_norm": 74.0, | |
| "learning_rate": 4.095292483350041e-06, | |
| "loss": 3.2668, | |
| "mean_token_accuracy": 0.41175087094306945, | |
| "num_tokens": 25954.0, | |
| "step": 820 | |
| }, | |
| { | |
| "entropy": 3.347863268852234, | |
| "epoch": 2.1391752577319587, | |
| "grad_norm": 93.5, | |
| "learning_rate": 4.066249810180895e-06, | |
| "loss": 3.1272, | |
| "mean_token_accuracy": 0.44678041338920593, | |
| "num_tokens": 26225.0, | |
| "step": 830 | |
| }, | |
| { | |
| "entropy": 3.2035139322280886, | |
| "epoch": 2.1649484536082473, | |
| "grad_norm": 69.0, | |
| "learning_rate": 4.036854940878284e-06, | |
| "loss": 3.2895, | |
| "mean_token_accuracy": 0.41500436663627627, | |
| "num_tokens": 26615.0, | |
| "step": 840 | |
| }, | |
| { | |
| "entropy": 3.2597701787948608, | |
| "epoch": 2.1907216494845363, | |
| "grad_norm": 49.0, | |
| "learning_rate": 4.007114485345205e-06, | |
| "loss": 3.0027, | |
| "mean_token_accuracy": 0.529634228348732, | |
| "num_tokens": 26948.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 3.3507386445999146, | |
| "epoch": 2.216494845360825, | |
| "grad_norm": 54.0, | |
| "learning_rate": 3.977035131195202e-06, | |
| "loss": 3.0962, | |
| "mean_token_accuracy": 0.4740526854991913, | |
| "num_tokens": 27268.0, | |
| "step": 860 | |
| }, | |
| { | |
| "entropy": 3.2031197786331176, | |
| "epoch": 2.2422680412371134, | |
| "grad_norm": 81.5, | |
| "learning_rate": 3.946623642248554e-06, | |
| "loss": 2.5841, | |
| "mean_token_accuracy": 0.5312987476587295, | |
| "num_tokens": 27549.0, | |
| "step": 870 | |
| }, | |
| { | |
| "entropy": 3.2550023555755616, | |
| "epoch": 2.268041237113402, | |
| "grad_norm": 69.5, | |
| "learning_rate": 3.915886857011323e-06, | |
| "loss": 3.1698, | |
| "mean_token_accuracy": 0.4532750606536865, | |
| "num_tokens": 27886.0, | |
| "step": 880 | |
| }, | |
| { | |
| "entropy": 3.485032653808594, | |
| "epoch": 2.2938144329896906, | |
| "grad_norm": 55.5, | |
| "learning_rate": 3.8848316871376055e-06, | |
| "loss": 3.4646, | |
| "mean_token_accuracy": 0.4280081331729889, | |
| "num_tokens": 28200.0, | |
| "step": 890 | |
| }, | |
| { | |
| "entropy": 3.3659554958343505, | |
| "epoch": 2.319587628865979, | |
| "grad_norm": 97.5, | |
| "learning_rate": 3.853465115875335e-06, | |
| "loss": 3.2682, | |
| "mean_token_accuracy": 0.45062295794487, | |
| "num_tokens": 28536.0, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 3.13352746963501, | |
| "epoch": 2.345360824742268, | |
| "grad_norm": 80.0, | |
| "learning_rate": 3.821794196495995e-06, | |
| "loss": 2.9256, | |
| "mean_token_accuracy": 0.4756322205066681, | |
| "num_tokens": 28810.0, | |
| "step": 910 | |
| }, | |
| { | |
| "entropy": 3.135547161102295, | |
| "epoch": 2.3711340206185567, | |
| "grad_norm": 65.5, | |
| "learning_rate": 3.7898260507085697e-06, | |
| "loss": 2.9811, | |
| "mean_token_accuracy": 0.4623017519712448, | |
| "num_tokens": 29089.0, | |
| "step": 920 | |
| }, | |
| { | |
| "entropy": 3.449264574050903, | |
| "epoch": 2.3969072164948453, | |
| "grad_norm": 61.75, | |
| "learning_rate": 3.757567867058125e-06, | |
| "loss": 3.6536, | |
| "mean_token_accuracy": 0.40380783975124357, | |
| "num_tokens": 29403.0, | |
| "step": 930 | |
| }, | |
| { | |
| "entropy": 3.4234744548797607, | |
| "epoch": 2.422680412371134, | |
| "grad_norm": 61.5, | |
| "learning_rate": 3.7250268993093396e-06, | |
| "loss": 3.2193, | |
| "mean_token_accuracy": 0.49216817915439603, | |
| "num_tokens": 29689.0, | |
| "step": 940 | |
| }, | |
| { | |
| "entropy": 3.4770421504974367, | |
| "epoch": 2.448453608247423, | |
| "grad_norm": 44.0, | |
| "learning_rate": 3.6922104648153885e-06, | |
| "loss": 3.3501, | |
| "mean_token_accuracy": 0.4592595547437668, | |
| "num_tokens": 30047.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 3.339312434196472, | |
| "epoch": 2.4742268041237114, | |
| "grad_norm": 55.0, | |
| "learning_rate": 3.659125942872516e-06, | |
| "loss": 3.075, | |
| "mean_token_accuracy": 0.4719044387340546, | |
| "num_tokens": 30376.0, | |
| "step": 960 | |
| }, | |
| { | |
| "entropy": 3.4688915252685546, | |
| "epoch": 2.5, | |
| "grad_norm": 61.75, | |
| "learning_rate": 3.625780773060687e-06, | |
| "loss": 3.1916, | |
| "mean_token_accuracy": 0.41194990575313567, | |
| "num_tokens": 30691.0, | |
| "step": 970 | |
| }, | |
| { | |
| "entropy": 3.302336239814758, | |
| "epoch": 2.5257731958762886, | |
| "grad_norm": 67.5, | |
| "learning_rate": 3.5921824535706756e-06, | |
| "loss": 3.1441, | |
| "mean_token_accuracy": 0.4681535869836807, | |
| "num_tokens": 31007.0, | |
| "step": 980 | |
| }, | |
| { | |
| "entropy": 3.343183135986328, | |
| "epoch": 2.551546391752577, | |
| "grad_norm": 71.0, | |
| "learning_rate": 3.5583385395179793e-06, | |
| "loss": 3.1333, | |
| "mean_token_accuracy": 0.47766990661621095, | |
| "num_tokens": 31272.0, | |
| "step": 990 | |
| }, | |
| { | |
| "entropy": 3.13364782333374, | |
| "epoch": 2.5773195876288657, | |
| "grad_norm": 80.5, | |
| "learning_rate": 3.5242566412439332e-06, | |
| "loss": 3.1096, | |
| "mean_token_accuracy": 0.46111657917499543, | |
| "num_tokens": 31549.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 3.2046863794326783, | |
| "epoch": 2.6030927835051547, | |
| "grad_norm": 103.0, | |
| "learning_rate": 3.4899444226044023e-06, | |
| "loss": 3.5947, | |
| "mean_token_accuracy": 0.41809163987636566, | |
| "num_tokens": 31839.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "entropy": 3.2269564390182497, | |
| "epoch": 2.6288659793814433, | |
| "grad_norm": 60.0, | |
| "learning_rate": 3.455409599246442e-06, | |
| "loss": 3.0418, | |
| "mean_token_accuracy": 0.4730542838573456, | |
| "num_tokens": 32131.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "entropy": 3.255638074874878, | |
| "epoch": 2.654639175257732, | |
| "grad_norm": 53.25, | |
| "learning_rate": 3.4206599368733114e-06, | |
| "loss": 3.1486, | |
| "mean_token_accuracy": 0.48221423029899596, | |
| "num_tokens": 32472.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "entropy": 3.304290223121643, | |
| "epoch": 2.680412371134021, | |
| "grad_norm": 51.5, | |
| "learning_rate": 3.3857032494982327e-06, | |
| "loss": 3.1768, | |
| "mean_token_accuracy": 0.44427731931209563, | |
| "num_tokens": 32882.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "entropy": 3.4136642932891847, | |
| "epoch": 2.7061855670103094, | |
| "grad_norm": 53.25, | |
| "learning_rate": 3.3505473976872883e-06, | |
| "loss": 3.1119, | |
| "mean_token_accuracy": 0.45050349533557893, | |
| "num_tokens": 33165.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "entropy": 3.386446309089661, | |
| "epoch": 2.731958762886598, | |
| "grad_norm": 71.0, | |
| "learning_rate": 3.3152002867918433e-06, | |
| "loss": 3.2298, | |
| "mean_token_accuracy": 0.41270871758461, | |
| "num_tokens": 33441.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "entropy": 3.571580958366394, | |
| "epoch": 2.7577319587628866, | |
| "grad_norm": 59.5, | |
| "learning_rate": 3.279669865170906e-06, | |
| "loss": 3.3263, | |
| "mean_token_accuracy": 0.45758517682552335, | |
| "num_tokens": 33761.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "entropy": 3.489269471168518, | |
| "epoch": 2.783505154639175, | |
| "grad_norm": 78.0, | |
| "learning_rate": 3.2439641224038093e-06, | |
| "loss": 3.0612, | |
| "mean_token_accuracy": 0.4716973781585693, | |
| "num_tokens": 34086.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "entropy": 3.352162575721741, | |
| "epoch": 2.8092783505154637, | |
| "grad_norm": 56.0, | |
| "learning_rate": 3.20809108749363e-06, | |
| "loss": 3.1394, | |
| "mean_token_accuracy": 0.45618820786476133, | |
| "num_tokens": 34422.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "entropy": 3.4569438695907593, | |
| "epoch": 2.8350515463917527, | |
| "grad_norm": 76.5, | |
| "learning_rate": 3.17205882706174e-06, | |
| "loss": 3.0988, | |
| "mean_token_accuracy": 0.48512459397315977, | |
| "num_tokens": 34726.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "entropy": 3.376886796951294, | |
| "epoch": 2.8608247422680413, | |
| "grad_norm": 103.0, | |
| "learning_rate": 3.135875443533896e-06, | |
| "loss": 3.4308, | |
| "mean_token_accuracy": 0.41293925344944, | |
| "num_tokens": 35057.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "entropy": 3.207944130897522, | |
| "epoch": 2.88659793814433, | |
| "grad_norm": 72.5, | |
| "learning_rate": 3.0995490733182825e-06, | |
| "loss": 2.7498, | |
| "mean_token_accuracy": 0.47106904685497286, | |
| "num_tokens": 35327.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "entropy": 3.2526353359222413, | |
| "epoch": 2.9123711340206184, | |
| "grad_norm": 73.5, | |
| "learning_rate": 3.06308788497591e-06, | |
| "loss": 3.1798, | |
| "mean_token_accuracy": 0.4563720256090164, | |
| "num_tokens": 35617.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "entropy": 3.3556177854537963, | |
| "epoch": 2.9381443298969074, | |
| "grad_norm": 67.0, | |
| "learning_rate": 3.026500077383785e-06, | |
| "loss": 3.1254, | |
| "mean_token_accuracy": 0.45203186869621276, | |
| "num_tokens": 35941.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "entropy": 3.4933328866958617, | |
| "epoch": 2.963917525773196, | |
| "grad_norm": 65.5, | |
| "learning_rate": 2.989793877891263e-06, | |
| "loss": 3.5153, | |
| "mean_token_accuracy": 0.38476662933826444, | |
| "num_tokens": 36320.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "entropy": 3.307176351547241, | |
| "epoch": 2.9896907216494846, | |
| "grad_norm": 80.5, | |
| "learning_rate": 2.952977540469999e-06, | |
| "loss": 3.0624, | |
| "mean_token_accuracy": 0.4267964720726013, | |
| "num_tokens": 36595.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "entropy": 3.25522928237915, | |
| "epoch": 3.015463917525773, | |
| "grad_norm": 71.5, | |
| "learning_rate": 2.9160593438579054e-06, | |
| "loss": 2.8878, | |
| "mean_token_accuracy": 0.4728631258010864, | |
| "num_tokens": 36963.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "entropy": 3.160351610183716, | |
| "epoch": 3.0412371134020617, | |
| "grad_norm": 73.5, | |
| "learning_rate": 2.879047589697555e-06, | |
| "loss": 3.0562, | |
| "mean_token_accuracy": 0.5218929082155228, | |
| "num_tokens": 37236.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "entropy": 3.497001218795776, | |
| "epoch": 3.0670103092783507, | |
| "grad_norm": 91.0, | |
| "learning_rate": 2.8419506006694186e-06, | |
| "loss": 3.2947, | |
| "mean_token_accuracy": 0.40737067759037016, | |
| "num_tokens": 37522.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "entropy": 3.3410105228424074, | |
| "epoch": 3.0927835051546393, | |
| "grad_norm": 61.5, | |
| "learning_rate": 2.8047767186203808e-06, | |
| "loss": 3.2214, | |
| "mean_token_accuracy": 0.45971156358718873, | |
| "num_tokens": 37825.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "entropy": 3.4304397821426393, | |
| "epoch": 3.118556701030928, | |
| "grad_norm": 68.0, | |
| "learning_rate": 2.767534302687942e-06, | |
| "loss": 3.5544, | |
| "mean_token_accuracy": 0.42803180813789365, | |
| "num_tokens": 38181.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "entropy": 3.2708693027496336, | |
| "epoch": 3.1443298969072164, | |
| "grad_norm": 51.5, | |
| "learning_rate": 2.730231727420533e-06, | |
| "loss": 3.0771, | |
| "mean_token_accuracy": 0.42246846556663514, | |
| "num_tokens": 38521.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "entropy": 3.278656005859375, | |
| "epoch": 3.170103092783505, | |
| "grad_norm": 58.75, | |
| "learning_rate": 2.6928773808943696e-06, | |
| "loss": 3.2341, | |
| "mean_token_accuracy": 0.4356242328882217, | |
| "num_tokens": 38806.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "entropy": 3.4282007455825805, | |
| "epoch": 3.195876288659794, | |
| "grad_norm": 85.5, | |
| "learning_rate": 2.6554796628272534e-06, | |
| "loss": 3.3791, | |
| "mean_token_accuracy": 0.44304960370063784, | |
| "num_tokens": 39104.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "entropy": 3.3579065799713135, | |
| "epoch": 3.2216494845360826, | |
| "grad_norm": 68.5, | |
| "learning_rate": 2.6180469826897683e-06, | |
| "loss": 3.3142, | |
| "mean_token_accuracy": 0.41488229632377627, | |
| "num_tokens": 39433.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "entropy": 3.2836158752441404, | |
| "epoch": 3.247422680412371, | |
| "grad_norm": 58.25, | |
| "learning_rate": 2.5805877578142713e-06, | |
| "loss": 3.0095, | |
| "mean_token_accuracy": 0.47964567244052886, | |
| "num_tokens": 39730.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "entropy": 3.3673797607421876, | |
| "epoch": 3.2731958762886597, | |
| "grad_norm": 76.0, | |
| "learning_rate": 2.543110411502129e-06, | |
| "loss": 2.7924, | |
| "mean_token_accuracy": 0.46048580706119535, | |
| "num_tokens": 40029.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "entropy": 3.2721627712249757, | |
| "epoch": 3.2989690721649483, | |
| "grad_norm": 64.5, | |
| "learning_rate": 2.5056233711295985e-06, | |
| "loss": 3.0903, | |
| "mean_token_accuracy": 0.4774274632334709, | |
| "num_tokens": 40310.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "entropy": 3.1327761888504027, | |
| "epoch": 3.3247422680412373, | |
| "grad_norm": 78.0, | |
| "learning_rate": 2.4681350662528004e-06, | |
| "loss": 2.9106, | |
| "mean_token_accuracy": 0.45230706930160525, | |
| "num_tokens": 40609.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "entropy": 3.2892409801483153, | |
| "epoch": 3.350515463917526, | |
| "grad_norm": 89.5, | |
| "learning_rate": 2.4306539267122e-06, | |
| "loss": 3.2275, | |
| "mean_token_accuracy": 0.42299139201641084, | |
| "num_tokens": 40899.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "entropy": 3.4429001808166504, | |
| "epoch": 3.3762886597938144, | |
| "grad_norm": 55.5, | |
| "learning_rate": 2.393188380737021e-06, | |
| "loss": 3.1632, | |
| "mean_token_accuracy": 0.45537458956241605, | |
| "num_tokens": 41229.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "entropy": 3.1667943239212035, | |
| "epoch": 3.402061855670103, | |
| "grad_norm": 61.25, | |
| "learning_rate": 2.3557468530500298e-06, | |
| "loss": 3.0767, | |
| "mean_token_accuracy": 0.41762115657329557, | |
| "num_tokens": 41595.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "entropy": 2.933236765861511, | |
| "epoch": 3.4278350515463916, | |
| "grad_norm": 51.0, | |
| "learning_rate": 2.3183377629730963e-06, | |
| "loss": 2.5435, | |
| "mean_token_accuracy": 0.507614666223526, | |
| "num_tokens": 41919.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "entropy": 3.458651614189148, | |
| "epoch": 3.4536082474226806, | |
| "grad_norm": 57.5, | |
| "learning_rate": 2.28096952253398e-06, | |
| "loss": 3.842, | |
| "mean_token_accuracy": 0.39763966798782346, | |
| "num_tokens": 42237.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "entropy": 3.4523784637451174, | |
| "epoch": 3.479381443298969, | |
| "grad_norm": 43.5, | |
| "learning_rate": 2.2436505345747505e-06, | |
| "loss": 3.1589, | |
| "mean_token_accuracy": 0.4656664371490479, | |
| "num_tokens": 42575.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "entropy": 3.239696431159973, | |
| "epoch": 3.5051546391752577, | |
| "grad_norm": 74.0, | |
| "learning_rate": 2.2063891908622767e-06, | |
| "loss": 2.7964, | |
| "mean_token_accuracy": 0.48300274908542634, | |
| "num_tokens": 42936.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "entropy": 3.3155257225036623, | |
| "epoch": 3.5309278350515463, | |
| "grad_norm": 58.75, | |
| "learning_rate": 2.169193870201203e-06, | |
| "loss": 3.106, | |
| "mean_token_accuracy": 0.463299959897995, | |
| "num_tokens": 43256.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "entropy": 3.298970675468445, | |
| "epoch": 3.556701030927835, | |
| "grad_norm": 99.5, | |
| "learning_rate": 2.1320729365498404e-06, | |
| "loss": 3.3008, | |
| "mean_token_accuracy": 0.4579169362783432, | |
| "num_tokens": 43519.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "entropy": 3.373097324371338, | |
| "epoch": 3.582474226804124, | |
| "grad_norm": 61.0, | |
| "learning_rate": 2.095034737139404e-06, | |
| "loss": 3.068, | |
| "mean_token_accuracy": 0.4262126713991165, | |
| "num_tokens": 43814.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "entropy": 3.1966411590576174, | |
| "epoch": 3.6082474226804124, | |
| "grad_norm": 70.5, | |
| "learning_rate": 2.058087600596997e-06, | |
| "loss": 2.8911, | |
| "mean_token_accuracy": 0.5069397330284119, | |
| "num_tokens": 44153.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "entropy": 3.3376646757125856, | |
| "epoch": 3.634020618556701, | |
| "grad_norm": 64.5, | |
| "learning_rate": 2.021239835072794e-06, | |
| "loss": 3.0766, | |
| "mean_token_accuracy": 0.46726988852024076, | |
| "num_tokens": 44465.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "entropy": 3.3123555183410645, | |
| "epoch": 3.6597938144329896, | |
| "grad_norm": 53.5, | |
| "learning_rate": 1.984499726371819e-06, | |
| "loss": 2.8527, | |
| "mean_token_accuracy": 0.5056732088327408, | |
| "num_tokens": 44792.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "entropy": 3.2575713872909544, | |
| "epoch": 3.6855670103092786, | |
| "grad_norm": 92.0, | |
| "learning_rate": 1.947875536090748e-06, | |
| "loss": 3.0459, | |
| "mean_token_accuracy": 0.46606201231479644, | |
| "num_tokens": 45093.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "entropy": 3.2526177883148195, | |
| "epoch": 3.711340206185567, | |
| "grad_norm": 56.0, | |
| "learning_rate": 1.9113754997601614e-06, | |
| "loss": 3.2644, | |
| "mean_token_accuracy": 0.4528957188129425, | |
| "num_tokens": 45381.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "entropy": 3.166100788116455, | |
| "epoch": 3.7371134020618557, | |
| "grad_norm": 60.75, | |
| "learning_rate": 1.875007824992654e-06, | |
| "loss": 2.9971, | |
| "mean_token_accuracy": 0.4395914673805237, | |
| "num_tokens": 45723.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "entropy": 3.41998028755188, | |
| "epoch": 3.7628865979381443, | |
| "grad_norm": 75.0, | |
| "learning_rate": 1.8387806896372206e-06, | |
| "loss": 3.2834, | |
| "mean_token_accuracy": 0.4282302588224411, | |
| "num_tokens": 46053.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "entropy": 3.263835144042969, | |
| "epoch": 3.788659793814433, | |
| "grad_norm": 73.0, | |
| "learning_rate": 1.8027022399403377e-06, | |
| "loss": 3.1976, | |
| "mean_token_accuracy": 0.4730199307203293, | |
| "num_tokens": 46330.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "entropy": 3.38959059715271, | |
| "epoch": 3.8144329896907214, | |
| "grad_norm": 86.5, | |
| "learning_rate": 1.7667805887141526e-06, | |
| "loss": 3.072, | |
| "mean_token_accuracy": 0.48471441566944123, | |
| "num_tokens": 46675.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "entropy": 3.516207385063171, | |
| "epoch": 3.8402061855670104, | |
| "grad_norm": 69.5, | |
| "learning_rate": 1.731023813512186e-06, | |
| "loss": 3.1848, | |
| "mean_token_accuracy": 0.4615781009197235, | |
| "num_tokens": 47005.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "entropy": 3.435403323173523, | |
| "epoch": 3.865979381443299, | |
| "grad_norm": 62.5, | |
| "learning_rate": 1.695439954812968e-06, | |
| "loss": 3.5288, | |
| "mean_token_accuracy": 0.4129458874464035, | |
| "num_tokens": 47320.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "entropy": 3.0035399436950683, | |
| "epoch": 3.8917525773195876, | |
| "grad_norm": 87.5, | |
| "learning_rate": 1.660037014212009e-06, | |
| "loss": 2.8988, | |
| "mean_token_accuracy": 0.5202886313199997, | |
| "num_tokens": 47654.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "entropy": 3.282400608062744, | |
| "epoch": 3.917525773195876, | |
| "grad_norm": 68.0, | |
| "learning_rate": 1.62482295262251e-06, | |
| "loss": 3.0969, | |
| "mean_token_accuracy": 0.42603813409805297, | |
| "num_tokens": 47988.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "entropy": 3.5067455053329466, | |
| "epoch": 3.943298969072165, | |
| "grad_norm": 63.75, | |
| "learning_rate": 1.589805688485231e-06, | |
| "loss": 3.2709, | |
| "mean_token_accuracy": 0.4377409517765045, | |
| "num_tokens": 48303.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "entropy": 3.245515561103821, | |
| "epoch": 3.9690721649484537, | |
| "grad_norm": 55.0, | |
| "learning_rate": 1.5549930959878996e-06, | |
| "loss": 2.9502, | |
| "mean_token_accuracy": 0.5143730461597442, | |
| "num_tokens": 48637.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "entropy": 3.1619110345840453, | |
| "epoch": 3.9948453608247423, | |
| "grad_norm": 61.25, | |
| "learning_rate": 1.5203930032945765e-06, | |
| "loss": 2.883, | |
| "mean_token_accuracy": 0.4930521368980408, | |
| "num_tokens": 48955.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "entropy": 3.368650484085083, | |
| "epoch": 4.020618556701031, | |
| "grad_norm": 59.75, | |
| "learning_rate": 1.4860131907853664e-06, | |
| "loss": 3.0012, | |
| "mean_token_accuracy": 0.4367584019899368, | |
| "num_tokens": 49264.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "entropy": 3.2799517631530763, | |
| "epoch": 4.046391752577319, | |
| "grad_norm": 107.0, | |
| "learning_rate": 1.45186138930688e-06, | |
| "loss": 3.0185, | |
| "mean_token_accuracy": 0.45549334287643434, | |
| "num_tokens": 49528.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "entropy": 3.293484115600586, | |
| "epoch": 4.072164948453608, | |
| "grad_norm": 70.0, | |
| "learning_rate": 1.4179452784338265e-06, | |
| "loss": 3.3586, | |
| "mean_token_accuracy": 0.41464213728904725, | |
| "num_tokens": 49823.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "entropy": 3.212877941131592, | |
| "epoch": 4.097938144329897, | |
| "grad_norm": 58.0, | |
| "learning_rate": 1.3842724847421435e-06, | |
| "loss": 2.9933, | |
| "mean_token_accuracy": 0.41281671822071075, | |
| "num_tokens": 50209.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "entropy": 3.2077365159988402, | |
| "epoch": 4.123711340206185, | |
| "grad_norm": 60.0, | |
| "learning_rate": 1.3508505800940327e-06, | |
| "loss": 3.2048, | |
| "mean_token_accuracy": 0.45703212916851044, | |
| "num_tokens": 50566.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "entropy": 3.2609221935272217, | |
| "epoch": 4.149484536082475, | |
| "grad_norm": 87.5, | |
| "learning_rate": 1.317687079935317e-06, | |
| "loss": 3.2903, | |
| "mean_token_accuracy": 0.45516538321971894, | |
| "num_tokens": 50932.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "entropy": 3.199517750740051, | |
| "epoch": 4.175257731958763, | |
| "grad_norm": 121.5, | |
| "learning_rate": 1.2847894416054645e-06, | |
| "loss": 2.9641, | |
| "mean_token_accuracy": 0.49091013371944425, | |
| "num_tokens": 51242.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "entropy": 3.2298731803894043, | |
| "epoch": 4.201030927835052, | |
| "grad_norm": 62.75, | |
| "learning_rate": 1.2521650626606926e-06, | |
| "loss": 2.9035, | |
| "mean_token_accuracy": 0.5309109538793564, | |
| "num_tokens": 51519.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "entropy": 3.3670015573501586, | |
| "epoch": 4.22680412371134, | |
| "grad_norm": 80.0, | |
| "learning_rate": 1.219821279210507e-06, | |
| "loss": 3.0805, | |
| "mean_token_accuracy": 0.444789519906044, | |
| "num_tokens": 51823.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "entropy": 3.3619128465652466, | |
| "epoch": 4.252577319587629, | |
| "grad_norm": 89.5, | |
| "learning_rate": 1.1877653642680618e-06, | |
| "loss": 3.0084, | |
| "mean_token_accuracy": 0.45184328258037565, | |
| "num_tokens": 52106.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "entropy": 3.4395506143569947, | |
| "epoch": 4.278350515463917, | |
| "grad_norm": 72.0, | |
| "learning_rate": 1.1560045261147079e-06, | |
| "loss": 3.4761, | |
| "mean_token_accuracy": 0.41549868881702423, | |
| "num_tokens": 52494.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "entropy": 3.154675102233887, | |
| "epoch": 4.304123711340206, | |
| "grad_norm": 62.25, | |
| "learning_rate": 1.1245459066790962e-06, | |
| "loss": 2.9656, | |
| "mean_token_accuracy": 0.49724196195602416, | |
| "num_tokens": 52842.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "entropy": 3.107476019859314, | |
| "epoch": 4.329896907216495, | |
| "grad_norm": 87.5, | |
| "learning_rate": 1.0933965799312015e-06, | |
| "loss": 2.8218, | |
| "mean_token_accuracy": 0.5045855909585952, | |
| "num_tokens": 53174.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "entropy": 3.290882110595703, | |
| "epoch": 4.355670103092783, | |
| "grad_norm": 87.0, | |
| "learning_rate": 1.062563550291626e-06, | |
| "loss": 3.0095, | |
| "mean_token_accuracy": 0.48295737206935885, | |
| "num_tokens": 53410.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "entropy": 3.23433403968811, | |
| "epoch": 4.381443298969073, | |
| "grad_norm": 57.5, | |
| "learning_rate": 1.0320537510565474e-06, | |
| "loss": 3.0136, | |
| "mean_token_accuracy": 0.4336048990488052, | |
| "num_tokens": 53745.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "entropy": 3.3636296510696413, | |
| "epoch": 4.407216494845361, | |
| "grad_norm": 60.5, | |
| "learning_rate": 1.0018740428386562e-06, | |
| "loss": 3.0679, | |
| "mean_token_accuracy": 0.47437537312507627, | |
| "num_tokens": 54031.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "entropy": 3.443432426452637, | |
| "epoch": 4.43298969072165, | |
| "grad_norm": 86.5, | |
| "learning_rate": 9.720312120244368e-07, | |
| "loss": 3.2067, | |
| "mean_token_accuracy": 0.4474021762609482, | |
| "num_tokens": 54356.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "entropy": 3.4575706720352173, | |
| "epoch": 4.458762886597938, | |
| "grad_norm": 80.0, | |
| "learning_rate": 9.425319692481421e-07, | |
| "loss": 3.244, | |
| "mean_token_accuracy": 0.44102594554424285, | |
| "num_tokens": 54627.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "entropy": 3.0440264463424684, | |
| "epoch": 4.484536082474227, | |
| "grad_norm": 63.0, | |
| "learning_rate": 9.133829478828e-07, | |
| "loss": 2.9248, | |
| "mean_token_accuracy": 0.5230745673179626, | |
| "num_tokens": 54941.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "entropy": 3.0223827600479125, | |
| "epoch": 4.510309278350515, | |
| "grad_norm": 77.5, | |
| "learning_rate": 8.845907025485945e-07, | |
| "loss": 2.8445, | |
| "mean_token_accuracy": 0.5213218927383423, | |
| "num_tokens": 55227.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "entropy": 3.4792301177978517, | |
| "epoch": 4.536082474226804, | |
| "grad_norm": 63.0, | |
| "learning_rate": 8.561617076389556e-07, | |
| "loss": 3.1867, | |
| "mean_token_accuracy": 0.4527750164270401, | |
| "num_tokens": 55547.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "entropy": 3.3560336589813233, | |
| "epoch": 4.561855670103093, | |
| "grad_norm": 85.0, | |
| "learning_rate": 8.281023558646892e-07, | |
| "loss": 3.2218, | |
| "mean_token_accuracy": 0.4219012975692749, | |
| "num_tokens": 55880.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "entropy": 3.2443851470947265, | |
| "epoch": 4.587628865979381, | |
| "grad_norm": 72.0, | |
| "learning_rate": 8.004189568164721e-07, | |
| "loss": 3.2558, | |
| "mean_token_accuracy": 0.4505341827869415, | |
| "num_tokens": 56225.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "entropy": 3.254337120056152, | |
| "epoch": 4.61340206185567, | |
| "grad_norm": 75.5, | |
| "learning_rate": 7.731177355460456e-07, | |
| "loss": 3.1863, | |
| "mean_token_accuracy": 0.4375568628311157, | |
| "num_tokens": 56528.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "entropy": 3.0034953951835632, | |
| "epoch": 4.639175257731958, | |
| "grad_norm": 64.0, | |
| "learning_rate": 7.462048311664086e-07, | |
| "loss": 2.6958, | |
| "mean_token_accuracy": 0.536104878783226, | |
| "num_tokens": 56847.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "entropy": 3.2068035364151, | |
| "epoch": 4.664948453608248, | |
| "grad_norm": 85.0, | |
| "learning_rate": 7.196862954713438e-07, | |
| "loss": 2.9987, | |
| "mean_token_accuracy": 0.4882556527853012, | |
| "num_tokens": 57154.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "entropy": 3.0603734254837036, | |
| "epoch": 4.690721649484536, | |
| "grad_norm": 54.5, | |
| "learning_rate": 6.935680915745743e-07, | |
| "loss": 2.6614, | |
| "mean_token_accuracy": 0.4894832164049149, | |
| "num_tokens": 57445.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "entropy": 3.4436551094055177, | |
| "epoch": 4.716494845360825, | |
| "grad_norm": 66.0, | |
| "learning_rate": 6.678560925688629e-07, | |
| "loss": 3.4491, | |
| "mean_token_accuracy": 0.42854584753513336, | |
| "num_tokens": 57799.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "entropy": 3.3284261226654053, | |
| "epoch": 4.742268041237113, | |
| "grad_norm": 69.5, | |
| "learning_rate": 6.425560802053551e-07, | |
| "loss": 3.1212, | |
| "mean_token_accuracy": 0.4246213287115097, | |
| "num_tokens": 58153.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "entropy": 3.3664702415466308, | |
| "epoch": 4.768041237113402, | |
| "grad_norm": 85.5, | |
| "learning_rate": 6.176737435934593e-07, | |
| "loss": 2.97, | |
| "mean_token_accuracy": 0.45907922089099884, | |
| "num_tokens": 58473.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "entropy": 3.260459637641907, | |
| "epoch": 4.793814432989691, | |
| "grad_norm": 70.0, | |
| "learning_rate": 5.932146779215614e-07, | |
| "loss": 3.091, | |
| "mean_token_accuracy": 0.47388018369674684, | |
| "num_tokens": 58787.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "entropy": 3.260399317741394, | |
| "epoch": 4.819587628865979, | |
| "grad_norm": 64.0, | |
| "learning_rate": 5.691843831988547e-07, | |
| "loss": 3.1014, | |
| "mean_token_accuracy": 0.4497444421052933, | |
| "num_tokens": 59126.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "entropy": 3.1505658864974975, | |
| "epoch": 4.845360824742268, | |
| "grad_norm": 101.5, | |
| "learning_rate": 5.45588263018581e-07, | |
| "loss": 3.0726, | |
| "mean_token_accuracy": 0.4976365238428116, | |
| "num_tokens": 59480.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "entropy": 3.3082144498825072, | |
| "epoch": 4.871134020618557, | |
| "grad_norm": 55.25, | |
| "learning_rate": 5.224316233429422e-07, | |
| "loss": 3.4489, | |
| "mean_token_accuracy": 0.41699815094470977, | |
| "num_tokens": 59755.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "entropy": 3.4492565393447876, | |
| "epoch": 4.896907216494846, | |
| "grad_norm": 129.0, | |
| "learning_rate": 4.997196713099728e-07, | |
| "loss": 3.3794, | |
| "mean_token_accuracy": 0.44576013684272764, | |
| "num_tokens": 60046.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "entropy": 3.398744559288025, | |
| "epoch": 4.922680412371134, | |
| "grad_norm": 88.0, | |
| "learning_rate": 4.774575140626317e-07, | |
| "loss": 3.3877, | |
| "mean_token_accuracy": 0.4413867056369781, | |
| "num_tokens": 60384.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "entropy": 3.20350604057312, | |
| "epoch": 4.948453608247423, | |
| "grad_norm": 56.0, | |
| "learning_rate": 4.556501576003791e-07, | |
| "loss": 3.0359, | |
| "mean_token_accuracy": 0.45435314774513247, | |
| "num_tokens": 60655.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "entropy": 2.980875849723816, | |
| "epoch": 4.974226804123711, | |
| "grad_norm": 80.5, | |
| "learning_rate": 4.343025056534994e-07, | |
| "loss": 2.9326, | |
| "mean_token_accuracy": 0.5152811527252197, | |
| "num_tokens": 60949.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "entropy": 3.3593399286270142, | |
| "epoch": 5.0, | |
| "grad_norm": 70.5, | |
| "learning_rate": 4.134193585804197e-07, | |
| "loss": 3.2147, | |
| "mean_token_accuracy": 0.4529120117425919, | |
| "num_tokens": 61260.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "entropy": 3.3000378370285035, | |
| "epoch": 5.025773195876289, | |
| "grad_norm": 75.0, | |
| "learning_rate": 3.930054122882709e-07, | |
| "loss": 3.2406, | |
| "mean_token_accuracy": 0.4164623826742172, | |
| "num_tokens": 61558.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "entropy": 3.211987781524658, | |
| "epoch": 5.051546391752577, | |
| "grad_norm": 56.75, | |
| "learning_rate": 3.7306525717694195e-07, | |
| "loss": 3.1386, | |
| "mean_token_accuracy": 0.5111187756061554, | |
| "num_tokens": 61848.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "entropy": 3.2421340942382812, | |
| "epoch": 5.077319587628866, | |
| "grad_norm": 79.5, | |
| "learning_rate": 3.536033771068506e-07, | |
| "loss": 2.9573, | |
| "mean_token_accuracy": 0.49586873054504393, | |
| "num_tokens": 62126.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "entropy": 3.501763868331909, | |
| "epoch": 5.103092783505154, | |
| "grad_norm": 67.5, | |
| "learning_rate": 3.3462414839068233e-07, | |
| "loss": 3.2128, | |
| "mean_token_accuracy": 0.4240656703710556, | |
| "num_tokens": 62529.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "entropy": 3.2703943729400633, | |
| "epoch": 5.128865979381443, | |
| "grad_norm": 57.5, | |
| "learning_rate": 3.1613183880930124e-07, | |
| "loss": 3.1511, | |
| "mean_token_accuracy": 0.4509860217571259, | |
| "num_tokens": 62808.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "entropy": 3.1918478488922117, | |
| "epoch": 5.154639175257732, | |
| "grad_norm": 90.0, | |
| "learning_rate": 2.9813060665207615e-07, | |
| "loss": 2.9956, | |
| "mean_token_accuracy": 0.5052088230848313, | |
| "num_tokens": 63054.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "entropy": 3.2867510318756104, | |
| "epoch": 5.180412371134021, | |
| "grad_norm": 62.0, | |
| "learning_rate": 2.8062449978182056e-07, | |
| "loss": 3.2784, | |
| "mean_token_accuracy": 0.4433626294136047, | |
| "num_tokens": 63378.0, | |
| "step": 2010 | |
| }, | |
| { | |
| "entropy": 3.276996612548828, | |
| "epoch": 5.206185567010309, | |
| "grad_norm": 59.5, | |
| "learning_rate": 2.636174547245671e-07, | |
| "loss": 2.9721, | |
| "mean_token_accuracy": 0.4633091241121292, | |
| "num_tokens": 63698.0, | |
| "step": 2020 | |
| }, | |
| { | |
| "entropy": 3.3924339771270753, | |
| "epoch": 5.231958762886598, | |
| "grad_norm": 70.5, | |
| "learning_rate": 2.471132957843775e-07, | |
| "loss": 3.5247, | |
| "mean_token_accuracy": 0.40621828436851504, | |
| "num_tokens": 64026.0, | |
| "step": 2030 | |
| }, | |
| { | |
| "entropy": 3.183038115501404, | |
| "epoch": 5.257731958762887, | |
| "grad_norm": 64.0, | |
| "learning_rate": 2.3111573418338724e-07, | |
| "loss": 3.057, | |
| "mean_token_accuracy": 0.4755241394042969, | |
| "num_tokens": 64357.0, | |
| "step": 2040 | |
| }, | |
| { | |
| "entropy": 3.0321537017822267, | |
| "epoch": 5.283505154639175, | |
| "grad_norm": 80.0, | |
| "learning_rate": 2.156283672272777e-07, | |
| "loss": 3.0536, | |
| "mean_token_accuracy": 0.46814982295036317, | |
| "num_tokens": 64717.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "entropy": 3.307017612457275, | |
| "epoch": 5.309278350515464, | |
| "grad_norm": 50.5, | |
| "learning_rate": 2.0065467749636497e-07, | |
| "loss": 3.0379, | |
| "mean_token_accuracy": 0.4797358334064484, | |
| "num_tokens": 65004.0, | |
| "step": 2060 | |
| }, | |
| { | |
| "entropy": 3.342577505111694, | |
| "epoch": 5.335051546391752, | |
| "grad_norm": 103.5, | |
| "learning_rate": 1.861980320624873e-07, | |
| "loss": 3.111, | |
| "mean_token_accuracy": 0.46001616716384885, | |
| "num_tokens": 65327.0, | |
| "step": 2070 | |
| }, | |
| { | |
| "entropy": 3.2767983198165895, | |
| "epoch": 5.360824742268041, | |
| "grad_norm": 51.25, | |
| "learning_rate": 1.7226168173186396e-07, | |
| "loss": 3.2626, | |
| "mean_token_accuracy": 0.44220549464225767, | |
| "num_tokens": 65743.0, | |
| "step": 2080 | |
| }, | |
| { | |
| "entropy": 3.1632169485092163, | |
| "epoch": 5.38659793814433, | |
| "grad_norm": 80.0, | |
| "learning_rate": 1.5884876031410202e-07, | |
| "loss": 3.4269, | |
| "mean_token_accuracy": 0.43744454979896547, | |
| "num_tokens": 66042.0, | |
| "step": 2090 | |
| }, | |
| { | |
| "entropy": 3.083737111091614, | |
| "epoch": 5.412371134020619, | |
| "grad_norm": 76.0, | |
| "learning_rate": 1.4596228391750676e-07, | |
| "loss": 3.0883, | |
| "mean_token_accuracy": 0.4829647660255432, | |
| "num_tokens": 66327.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "entropy": 3.032327151298523, | |
| "epoch": 5.438144329896907, | |
| "grad_norm": 99.5, | |
| "learning_rate": 1.3360515027086462e-07, | |
| "loss": 2.8775, | |
| "mean_token_accuracy": 0.4383985221385956, | |
| "num_tokens": 66641.0, | |
| "step": 2110 | |
| }, | |
| { | |
| "entropy": 3.05249879360199, | |
| "epoch": 5.463917525773196, | |
| "grad_norm": 87.0, | |
| "learning_rate": 1.2178013807184085e-07, | |
| "loss": 3.0836, | |
| "mean_token_accuracy": 0.457595694065094, | |
| "num_tokens": 66937.0, | |
| "step": 2120 | |
| }, | |
| { | |
| "entropy": 3.3152157068252563, | |
| "epoch": 5.489690721649485, | |
| "grad_norm": 76.0, | |
| "learning_rate": 1.1048990636214618e-07, | |
| "loss": 3.1917, | |
| "mean_token_accuracy": 0.48176924884319305, | |
| "num_tokens": 67223.0, | |
| "step": 2130 | |
| }, | |
| { | |
| "entropy": 3.5079283475875855, | |
| "epoch": 5.515463917525773, | |
| "grad_norm": 60.75, | |
| "learning_rate": 9.973699392960917e-08, | |
| "loss": 3.3596, | |
| "mean_token_accuracy": 0.40318597555160524, | |
| "num_tokens": 67536.0, | |
| "step": 2140 | |
| }, | |
| { | |
| "entropy": 3.342143940925598, | |
| "epoch": 5.541237113402062, | |
| "grad_norm": 65.5, | |
| "learning_rate": 8.952381873728821e-08, | |
| "loss": 2.9796, | |
| "mean_token_accuracy": 0.46548409163951876, | |
| "num_tokens": 67876.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "entropy": 3.1259532451629637, | |
| "epoch": 5.56701030927835, | |
| "grad_norm": 49.75, | |
| "learning_rate": 7.985267737975588e-08, | |
| "loss": 3.0859, | |
| "mean_token_accuracy": 0.4911260426044464, | |
| "num_tokens": 68229.0, | |
| "step": 2160 | |
| }, | |
| { | |
| "entropy": 3.450204849243164, | |
| "epoch": 5.592783505154639, | |
| "grad_norm": 71.5, | |
| "learning_rate": 7.072574456667258e-08, | |
| "loss": 3.0189, | |
| "mean_token_accuracy": 0.47769983410835265, | |
| "num_tokens": 68567.0, | |
| "step": 2170 | |
| }, | |
| { | |
| "entropy": 3.2766273736953737, | |
| "epoch": 5.618556701030927, | |
| "grad_norm": 83.5, | |
| "learning_rate": 6.214507263376801e-08, | |
| "loss": 2.881, | |
| "mean_token_accuracy": 0.48972640335559847, | |
| "num_tokens": 68846.0, | |
| "step": 2180 | |
| }, | |
| { | |
| "entropy": 3.267438220977783, | |
| "epoch": 5.644329896907217, | |
| "grad_norm": 68.0, | |
| "learning_rate": 5.411259108134115e-08, | |
| "loss": 2.9084, | |
| "mean_token_accuracy": 0.4877107352018356, | |
| "num_tokens": 69197.0, | |
| "step": 2190 | |
| }, | |
| { | |
| "entropy": 3.320011782646179, | |
| "epoch": 5.670103092783505, | |
| "grad_norm": 70.0, | |
| "learning_rate": 4.663010614038205e-08, | |
| "loss": 3.1586, | |
| "mean_token_accuracy": 0.4482086032629013, | |
| "num_tokens": 69493.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "entropy": 3.4605057001113892, | |
| "epoch": 5.695876288659794, | |
| "grad_norm": 86.0, | |
| "learning_rate": 3.9699300366410895e-08, | |
| "loss": 3.2727, | |
| "mean_token_accuracy": 0.4599607527256012, | |
| "num_tokens": 69832.0, | |
| "step": 2210 | |
| }, | |
| { | |
| "entropy": 3.275946855545044, | |
| "epoch": 5.721649484536083, | |
| "grad_norm": 58.5, | |
| "learning_rate": 3.332173226113067e-08, | |
| "loss": 3.0561, | |
| "mean_token_accuracy": 0.460858029127121, | |
| "num_tokens": 70215.0, | |
| "step": 2220 | |
| }, | |
| { | |
| "entropy": 3.389675521850586, | |
| "epoch": 5.747422680412371, | |
| "grad_norm": 92.0, | |
| "learning_rate": 2.7498835921971058e-08, | |
| "loss": 3.0778, | |
| "mean_token_accuracy": 0.44814312756061553, | |
| "num_tokens": 70534.0, | |
| "step": 2230 | |
| }, | |
| { | |
| "entropy": 3.1975868225097654, | |
| "epoch": 5.77319587628866, | |
| "grad_norm": 72.5, | |
| "learning_rate": 2.2231920719610057e-08, | |
| "loss": 2.9487, | |
| "mean_token_accuracy": 0.47108798921108247, | |
| "num_tokens": 70838.0, | |
| "step": 2240 | |
| }, | |
| { | |
| "entropy": 3.20022566318512, | |
| "epoch": 5.798969072164948, | |
| "grad_norm": 69.0, | |
| "learning_rate": 1.75221710035417e-08, | |
| "loss": 2.67, | |
| "mean_token_accuracy": 0.5278927952051162, | |
| "num_tokens": 71133.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "entropy": 3.337849473953247, | |
| "epoch": 5.824742268041237, | |
| "grad_norm": 93.5, | |
| "learning_rate": 1.3370645835754926e-08, | |
| "loss": 3.1954, | |
| "mean_token_accuracy": 0.4186277240514755, | |
| "num_tokens": 71475.0, | |
| "step": 2260 | |
| }, | |
| { | |
| "entropy": 3.2620397567749024, | |
| "epoch": 5.850515463917525, | |
| "grad_norm": 99.0, | |
| "learning_rate": 9.77827875258769e-09, | |
| "loss": 3.0062, | |
| "mean_token_accuracy": 0.4634517639875412, | |
| "num_tokens": 71766.0, | |
| "step": 2270 | |
| }, | |
| { | |
| "entropy": 3.139668273925781, | |
| "epoch": 5.876288659793815, | |
| "grad_norm": 84.0, | |
| "learning_rate": 6.745877554806268e-09, | |
| "loss": 3.0198, | |
| "mean_token_accuracy": 0.4661685138940811, | |
| "num_tokens": 72053.0, | |
| "step": 2280 | |
| }, | |
| { | |
| "entropy": 3.293707489967346, | |
| "epoch": 5.902061855670103, | |
| "grad_norm": 59.0, | |
| "learning_rate": 4.274124125958068e-09, | |
| "loss": 3.0505, | |
| "mean_token_accuracy": 0.46237342357635497, | |
| "num_tokens": 72347.0, | |
| "step": 2290 | |
| }, | |
| { | |
| "entropy": 3.4691567182540894, | |
| "epoch": 5.927835051546392, | |
| "grad_norm": 59.5, | |
| "learning_rate": 2.363574279040104e-09, | |
| "loss": 3.3964, | |
| "mean_token_accuracy": 0.4291213423013687, | |
| "num_tokens": 72681.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "entropy": 3.170962166786194, | |
| "epoch": 5.953608247422681, | |
| "grad_norm": 81.5, | |
| "learning_rate": 1.0146576315145374e-09, | |
| "loss": 2.9427, | |
| "mean_token_accuracy": 0.5170234173536301, | |
| "num_tokens": 72977.0, | |
| "step": 2310 | |
| }, | |
| { | |
| "entropy": 3.2913349628448487, | |
| "epoch": 5.979381443298969, | |
| "grad_norm": 67.0, | |
| "learning_rate": 2.276775087031724e-10, | |
| "loss": 3.0982, | |
| "mean_token_accuracy": 0.40257313251495364, | |
| "num_tokens": 73292.0, | |
| "step": 2320 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2328, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 6, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 621651308691456.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |