| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 939, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 2.771598082780838, |
| "epoch": 0.032, |
| "grad_norm": 0.2451171875, |
| "learning_rate": 6.206896551724138e-05, |
| "loss": 2.916146087646484, |
| "mean_token_accuracy": 0.45416649207472803, |
| "num_tokens": 55650.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 2.3006236433982847, |
| "epoch": 0.064, |
| "grad_norm": 0.23828125, |
| "learning_rate": 0.00013103448275862068, |
| "loss": 2.4864336013793946, |
| "mean_token_accuracy": 0.5021730229258538, |
| "num_tokens": 110943.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 1.8387477725744248, |
| "epoch": 0.096, |
| "grad_norm": 0.287109375, |
| "learning_rate": 0.0002, |
| "loss": 1.766120147705078, |
| "mean_token_accuracy": 0.6066308304667473, |
| "num_tokens": 166476.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 1.2004003927111626, |
| "epoch": 0.128, |
| "grad_norm": 0.3203125, |
| "learning_rate": 0.0001978021978021978, |
| "loss": 1.1438531875610352, |
| "mean_token_accuracy": 0.7330661401152611, |
| "num_tokens": 221809.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.8197565108537674, |
| "epoch": 0.16, |
| "grad_norm": 0.34765625, |
| "learning_rate": 0.00019560439560439562, |
| "loss": 0.7494671821594239, |
| "mean_token_accuracy": 0.8239153817296028, |
| "num_tokens": 277841.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.5685673624277114, |
| "epoch": 0.192, |
| "grad_norm": 0.369140625, |
| "learning_rate": 0.00019340659340659342, |
| "loss": 0.49509191513061523, |
| "mean_token_accuracy": 0.8886354997754097, |
| "num_tokens": 333258.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.36241610124707224, |
| "epoch": 0.224, |
| "grad_norm": 0.306640625, |
| "learning_rate": 0.00019120879120879122, |
| "loss": 0.29932661056518556, |
| "mean_token_accuracy": 0.9331418961286545, |
| "num_tokens": 388437.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.2608797810971737, |
| "epoch": 0.256, |
| "grad_norm": 0.322265625, |
| "learning_rate": 0.00018901098901098903, |
| "loss": 0.20479197502136232, |
| "mean_token_accuracy": 0.9527293920516968, |
| "num_tokens": 444483.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.20162589177489282, |
| "epoch": 0.288, |
| "grad_norm": 0.1904296875, |
| "learning_rate": 0.00018681318681318683, |
| "loss": 0.16634706258773804, |
| "mean_token_accuracy": 0.9603863671422005, |
| "num_tokens": 500408.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.1615206029266119, |
| "epoch": 0.32, |
| "grad_norm": 0.1904296875, |
| "learning_rate": 0.00018461538461538463, |
| "loss": 0.14311870336532592, |
| "mean_token_accuracy": 0.9628557220101357, |
| "num_tokens": 556058.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.1455086786299944, |
| "epoch": 0.352, |
| "grad_norm": 0.1796875, |
| "learning_rate": 0.0001824175824175824, |
| "loss": 0.1271807074546814, |
| "mean_token_accuracy": 0.9651171401143074, |
| "num_tokens": 611407.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.13460372295230627, |
| "epoch": 0.384, |
| "grad_norm": 0.1572265625, |
| "learning_rate": 0.00018021978021978024, |
| "loss": 0.11452269554138184, |
| "mean_token_accuracy": 0.9668730065226555, |
| "num_tokens": 667323.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.126934945769608, |
| "epoch": 0.416, |
| "grad_norm": 0.10986328125, |
| "learning_rate": 0.00017802197802197802, |
| "loss": 0.10631006956100464, |
| "mean_token_accuracy": 0.9682586997747421, |
| "num_tokens": 723233.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.12308279145509005, |
| "epoch": 0.448, |
| "grad_norm": 0.1806640625, |
| "learning_rate": 0.00017582417582417582, |
| "loss": 0.09787563681602478, |
| "mean_token_accuracy": 0.9693531423807145, |
| "num_tokens": 779623.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.11310338769108057, |
| "epoch": 0.48, |
| "grad_norm": 0.1552734375, |
| "learning_rate": 0.00017362637362637365, |
| "loss": 0.09501280188560486, |
| "mean_token_accuracy": 0.9697160139679909, |
| "num_tokens": 835343.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.10735327322036028, |
| "epoch": 0.512, |
| "grad_norm": 0.12109375, |
| "learning_rate": 0.00017142857142857143, |
| "loss": 0.0945986807346344, |
| "mean_token_accuracy": 0.9699150815606117, |
| "num_tokens": 890526.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.10488443765789271, |
| "epoch": 0.544, |
| "grad_norm": 0.08935546875, |
| "learning_rate": 0.00016923076923076923, |
| "loss": 0.09071503281593322, |
| "mean_token_accuracy": 0.9702877476811409, |
| "num_tokens": 946551.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.09990130253136158, |
| "epoch": 0.576, |
| "grad_norm": 0.10986328125, |
| "learning_rate": 0.00016703296703296706, |
| "loss": 0.08788512349128723, |
| "mean_token_accuracy": 0.9700309321284294, |
| "num_tokens": 1002250.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.1006152719259262, |
| "epoch": 0.608, |
| "grad_norm": 0.138671875, |
| "learning_rate": 0.00016483516483516484, |
| "loss": 0.08786565065383911, |
| "mean_token_accuracy": 0.9703836083412171, |
| "num_tokens": 1057942.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.09826808385550975, |
| "epoch": 0.64, |
| "grad_norm": 0.10693359375, |
| "learning_rate": 0.00016263736263736264, |
| "loss": 0.08827171325683594, |
| "mean_token_accuracy": 0.9702799677848816, |
| "num_tokens": 1113313.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.09519640635699034, |
| "epoch": 0.672, |
| "grad_norm": 0.109375, |
| "learning_rate": 0.00016043956043956044, |
| "loss": 0.08511611819267273, |
| "mean_token_accuracy": 0.9707273244857788, |
| "num_tokens": 1168866.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.09626698959618807, |
| "epoch": 0.704, |
| "grad_norm": 0.07958984375, |
| "learning_rate": 0.00015824175824175824, |
| "loss": 0.08481809496879578, |
| "mean_token_accuracy": 0.971166367828846, |
| "num_tokens": 1224112.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.08999720010906458, |
| "epoch": 0.736, |
| "grad_norm": 0.11962890625, |
| "learning_rate": 0.00015604395604395605, |
| "loss": 0.08373072743415833, |
| "mean_token_accuracy": 0.9710352584719658, |
| "num_tokens": 1279497.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.0909602127969265, |
| "epoch": 0.768, |
| "grad_norm": 0.08251953125, |
| "learning_rate": 0.00015384615384615385, |
| "loss": 0.0811634123325348, |
| "mean_token_accuracy": 0.9713989913463592, |
| "num_tokens": 1335147.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.08677519466727972, |
| "epoch": 0.8, |
| "grad_norm": 0.0791015625, |
| "learning_rate": 0.00015164835164835165, |
| "loss": 0.08006779551506042, |
| "mean_token_accuracy": 0.9713309288024903, |
| "num_tokens": 1390441.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.08524699918925763, |
| "epoch": 0.832, |
| "grad_norm": 0.1142578125, |
| "learning_rate": 0.00014945054945054946, |
| "loss": 0.07839072942733764, |
| "mean_token_accuracy": 0.9715656459331512, |
| "num_tokens": 1446334.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.08736930266022683, |
| "epoch": 0.864, |
| "grad_norm": 0.10791015625, |
| "learning_rate": 0.00014725274725274726, |
| "loss": 0.07965806126594543, |
| "mean_token_accuracy": 0.971031291782856, |
| "num_tokens": 1502214.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.08625259138643741, |
| "epoch": 0.896, |
| "grad_norm": 0.08447265625, |
| "learning_rate": 0.00014505494505494506, |
| "loss": 0.0801069438457489, |
| "mean_token_accuracy": 0.9713141709566117, |
| "num_tokens": 1557731.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 0.08634743597358466, |
| "epoch": 0.928, |
| "grad_norm": 0.06298828125, |
| "learning_rate": 0.00014285714285714287, |
| "loss": 0.07907066345214844, |
| "mean_token_accuracy": 0.9716951295733451, |
| "num_tokens": 1613126.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.08533936887979507, |
| "epoch": 0.96, |
| "grad_norm": 0.06103515625, |
| "learning_rate": 0.00014065934065934067, |
| "loss": 0.07907315492630004, |
| "mean_token_accuracy": 0.9721988439559937, |
| "num_tokens": 1668452.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.08410668671131134, |
| "epoch": 0.992, |
| "grad_norm": 0.0791015625, |
| "learning_rate": 0.00013846153846153847, |
| "loss": 0.07861064672470093, |
| "mean_token_accuracy": 0.971791522204876, |
| "num_tokens": 1724177.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 0.08109197275418985, |
| "epoch": 1.0224, |
| "grad_norm": 0.07177734375, |
| "learning_rate": 0.00013626373626373628, |
| "loss": 0.07589302062988282, |
| "mean_token_accuracy": 0.9723429350476516, |
| "num_tokens": 1777420.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.0814886923879385, |
| "epoch": 1.0544, |
| "grad_norm": 0.10693359375, |
| "learning_rate": 0.00013406593406593405, |
| "loss": 0.07642998099327088, |
| "mean_token_accuracy": 0.97195183634758, |
| "num_tokens": 1833060.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.08034903313964606, |
| "epoch": 1.0864, |
| "grad_norm": 0.07373046875, |
| "learning_rate": 0.00013186813186813188, |
| "loss": 0.07447389960289001, |
| "mean_token_accuracy": 0.9732914686203002, |
| "num_tokens": 1889075.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.07864065244793891, |
| "epoch": 1.1184, |
| "grad_norm": 0.08056640625, |
| "learning_rate": 0.0001296703296703297, |
| "loss": 0.07513262033462524, |
| "mean_token_accuracy": 0.972836098074913, |
| "num_tokens": 1944905.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.08301715180277824, |
| "epoch": 1.1504, |
| "grad_norm": 0.09716796875, |
| "learning_rate": 0.00012747252747252746, |
| "loss": 0.07624064683914185, |
| "mean_token_accuracy": 0.9722976922988892, |
| "num_tokens": 2000057.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.08098908923566342, |
| "epoch": 1.1824, |
| "grad_norm": 0.059814453125, |
| "learning_rate": 0.00012527472527472527, |
| "loss": 0.07458102107048034, |
| "mean_token_accuracy": 0.9721322387456894, |
| "num_tokens": 2055866.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 0.07686591371893883, |
| "epoch": 1.2144, |
| "grad_norm": 0.06396484375, |
| "learning_rate": 0.0001230769230769231, |
| "loss": 0.07280548810958862, |
| "mean_token_accuracy": 0.972561864554882, |
| "num_tokens": 2111077.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.07761757280677557, |
| "epoch": 1.2464, |
| "grad_norm": 0.07470703125, |
| "learning_rate": 0.00012087912087912087, |
| "loss": 0.07433983087539672, |
| "mean_token_accuracy": 0.9725529655814171, |
| "num_tokens": 2166081.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.08011266030371189, |
| "epoch": 1.2784, |
| "grad_norm": 0.052001953125, |
| "learning_rate": 0.00011868131868131869, |
| "loss": 0.0738287627696991, |
| "mean_token_accuracy": 0.9728635787963867, |
| "num_tokens": 2221310.0, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.0769817665219307, |
| "epoch": 1.3104, |
| "grad_norm": 0.054931640625, |
| "learning_rate": 0.0001164835164835165, |
| "loss": 0.07387230396270753, |
| "mean_token_accuracy": 0.9729015439748764, |
| "num_tokens": 2277107.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 0.07817615140229464, |
| "epoch": 1.3424, |
| "grad_norm": 0.06787109375, |
| "learning_rate": 0.00011428571428571428, |
| "loss": 0.07262731790542602, |
| "mean_token_accuracy": 0.9729507148265839, |
| "num_tokens": 2332758.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 0.07688614577054978, |
| "epoch": 1.3744, |
| "grad_norm": 0.051025390625, |
| "learning_rate": 0.0001120879120879121, |
| "loss": 0.07327454686164855, |
| "mean_token_accuracy": 0.9719040498137475, |
| "num_tokens": 2388461.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 0.07903551124036312, |
| "epoch": 1.4064, |
| "grad_norm": 0.05126953125, |
| "learning_rate": 0.0001098901098901099, |
| "loss": 0.07202324867248536, |
| "mean_token_accuracy": 0.9729711979627609, |
| "num_tokens": 2443802.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 0.07504601553082466, |
| "epoch": 1.4384000000000001, |
| "grad_norm": 0.0966796875, |
| "learning_rate": 0.0001076923076923077, |
| "loss": 0.07251456379890442, |
| "mean_token_accuracy": 0.9728567853569985, |
| "num_tokens": 2498886.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.07635734435170889, |
| "epoch": 1.4704, |
| "grad_norm": 0.07861328125, |
| "learning_rate": 0.0001054945054945055, |
| "loss": 0.07308706045150756, |
| "mean_token_accuracy": 0.9728769212961197, |
| "num_tokens": 2554546.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 0.07706241644918918, |
| "epoch": 1.5024, |
| "grad_norm": 0.052734375, |
| "learning_rate": 0.00010329670329670331, |
| "loss": 0.0728609800338745, |
| "mean_token_accuracy": 0.9726115748286247, |
| "num_tokens": 2609939.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.07556705921888351, |
| "epoch": 1.5344, |
| "grad_norm": 0.087890625, |
| "learning_rate": 0.0001010989010989011, |
| "loss": 0.07160326838493347, |
| "mean_token_accuracy": 0.973236757516861, |
| "num_tokens": 2665583.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 0.07504178639501333, |
| "epoch": 1.5664, |
| "grad_norm": 0.08349609375, |
| "learning_rate": 9.89010989010989e-05, |
| "loss": 0.0718912661075592, |
| "mean_token_accuracy": 0.9726392358541489, |
| "num_tokens": 2721224.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 0.07667357344180345, |
| "epoch": 1.5984, |
| "grad_norm": 0.044921875, |
| "learning_rate": 9.670329670329671e-05, |
| "loss": 0.07309556603431702, |
| "mean_token_accuracy": 0.9725197270512581, |
| "num_tokens": 2776793.0, |
| "step": 500 |
| }, |
| { |
| "entropy": 0.07603078782558441, |
| "epoch": 1.6303999999999998, |
| "grad_norm": 0.0673828125, |
| "learning_rate": 9.450549450549451e-05, |
| "loss": 0.07351203560829163, |
| "mean_token_accuracy": 0.9724631071090698, |
| "num_tokens": 2832471.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 0.07741717118769884, |
| "epoch": 1.6623999999999999, |
| "grad_norm": 0.052001953125, |
| "learning_rate": 9.230769230769232e-05, |
| "loss": 0.07223351001739502, |
| "mean_token_accuracy": 0.9723551839590072, |
| "num_tokens": 2888234.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 0.07598806507885456, |
| "epoch": 1.6944, |
| "grad_norm": 0.06689453125, |
| "learning_rate": 9.010989010989012e-05, |
| "loss": 0.07230474948883056, |
| "mean_token_accuracy": 0.9722696229815483, |
| "num_tokens": 2943732.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 0.0750182744115591, |
| "epoch": 1.7264, |
| "grad_norm": 0.04931640625, |
| "learning_rate": 8.791208791208791e-05, |
| "loss": 0.07132035493850708, |
| "mean_token_accuracy": 0.9732247874140739, |
| "num_tokens": 2999654.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 0.07485976945608855, |
| "epoch": 1.7584, |
| "grad_norm": 0.0478515625, |
| "learning_rate": 8.571428571428571e-05, |
| "loss": 0.07085888981819152, |
| "mean_token_accuracy": 0.9734048008918762, |
| "num_tokens": 3055591.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 0.07469552531838416, |
| "epoch": 1.7904, |
| "grad_norm": 0.04638671875, |
| "learning_rate": 8.351648351648353e-05, |
| "loss": 0.07132892608642578, |
| "mean_token_accuracy": 0.9727317884564399, |
| "num_tokens": 3111372.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 0.0737810717895627, |
| "epoch": 1.8224, |
| "grad_norm": 0.052001953125, |
| "learning_rate": 8.131868131868132e-05, |
| "loss": 0.07149158120155334, |
| "mean_token_accuracy": 0.9739102691411972, |
| "num_tokens": 3167376.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 0.0747382478788495, |
| "epoch": 1.8544, |
| "grad_norm": 0.055908203125, |
| "learning_rate": 7.912087912087912e-05, |
| "loss": 0.07183558940887451, |
| "mean_token_accuracy": 0.972593954205513, |
| "num_tokens": 3222797.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 0.07589616179466248, |
| "epoch": 1.8864, |
| "grad_norm": 0.04833984375, |
| "learning_rate": 7.692307692307693e-05, |
| "loss": 0.07035009264945984, |
| "mean_token_accuracy": 0.9727584093809127, |
| "num_tokens": 3278083.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 0.07409894913434982, |
| "epoch": 1.9184, |
| "grad_norm": 0.041015625, |
| "learning_rate": 7.472527472527473e-05, |
| "loss": 0.06983839273452759, |
| "mean_token_accuracy": 0.9737206190824509, |
| "num_tokens": 3334115.0, |
| "step": 600 |
| }, |
| { |
| "entropy": 0.07298169508576394, |
| "epoch": 1.9504000000000001, |
| "grad_norm": 0.055419921875, |
| "learning_rate": 7.252747252747253e-05, |
| "loss": 0.07096859216690063, |
| "mean_token_accuracy": 0.9732470810413361, |
| "num_tokens": 3389990.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 0.07362735010683537, |
| "epoch": 1.9824000000000002, |
| "grad_norm": 0.05078125, |
| "learning_rate": 7.032967032967034e-05, |
| "loss": 0.0709508240222931, |
| "mean_token_accuracy": 0.9730613023042679, |
| "num_tokens": 3445703.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 0.07376520100392793, |
| "epoch": 2.0128, |
| "grad_norm": 0.050048828125, |
| "learning_rate": 6.813186813186814e-05, |
| "loss": 0.06944339275360108, |
| "mean_token_accuracy": 0.973434633330295, |
| "num_tokens": 3498933.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 0.07382834255695343, |
| "epoch": 2.0448, |
| "grad_norm": 0.048583984375, |
| "learning_rate": 6.593406593406594e-05, |
| "loss": 0.07052375078201294, |
| "mean_token_accuracy": 0.9732005745172501, |
| "num_tokens": 3553934.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 0.0728354575112462, |
| "epoch": 2.0768, |
| "grad_norm": 0.09619140625, |
| "learning_rate": 6.373626373626373e-05, |
| "loss": 0.07018245458602905, |
| "mean_token_accuracy": 0.9729705214500427, |
| "num_tokens": 3609458.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 0.07463801130652428, |
| "epoch": 2.1088, |
| "grad_norm": 0.04736328125, |
| "learning_rate": 6.153846153846155e-05, |
| "loss": 0.07015591859817505, |
| "mean_token_accuracy": 0.9737546548247338, |
| "num_tokens": 3664572.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 0.07311667818576098, |
| "epoch": 2.1408, |
| "grad_norm": 0.051513671875, |
| "learning_rate": 5.9340659340659345e-05, |
| "loss": 0.06875128149986268, |
| "mean_token_accuracy": 0.9733711332082748, |
| "num_tokens": 3720225.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 0.0712715208530426, |
| "epoch": 2.1728, |
| "grad_norm": 0.0537109375, |
| "learning_rate": 5.714285714285714e-05, |
| "loss": 0.06806424856185914, |
| "mean_token_accuracy": 0.9737527936697006, |
| "num_tokens": 3776424.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 0.07198722306638956, |
| "epoch": 2.2048, |
| "grad_norm": 0.046630859375, |
| "learning_rate": 5.494505494505495e-05, |
| "loss": 0.06764371991157532, |
| "mean_token_accuracy": 0.9738867044448852, |
| "num_tokens": 3832496.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 0.07190853431820869, |
| "epoch": 2.2368, |
| "grad_norm": 0.0498046875, |
| "learning_rate": 5.274725274725275e-05, |
| "loss": 0.06873984336853027, |
| "mean_token_accuracy": 0.9736966788768768, |
| "num_tokens": 3888101.0, |
| "step": 700 |
| }, |
| { |
| "entropy": 0.07232791539281606, |
| "epoch": 2.2688, |
| "grad_norm": 0.048828125, |
| "learning_rate": 5.054945054945055e-05, |
| "loss": 0.0678622543811798, |
| "mean_token_accuracy": 0.9740407422184945, |
| "num_tokens": 3944308.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 0.07142861131578684, |
| "epoch": 2.3008, |
| "grad_norm": 0.04931640625, |
| "learning_rate": 4.8351648351648355e-05, |
| "loss": 0.06784402132034302, |
| "mean_token_accuracy": 0.9742121011018753, |
| "num_tokens": 4000402.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 0.07213470414280891, |
| "epoch": 2.3327999999999998, |
| "grad_norm": 0.053466796875, |
| "learning_rate": 4.615384615384616e-05, |
| "loss": 0.0697720229625702, |
| "mean_token_accuracy": 0.9730553776025772, |
| "num_tokens": 4055460.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 0.07274228539317847, |
| "epoch": 2.3648, |
| "grad_norm": 0.05126953125, |
| "learning_rate": 4.3956043956043955e-05, |
| "loss": 0.06925151348114014, |
| "mean_token_accuracy": 0.9731604158878326, |
| "num_tokens": 4110709.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 0.0725497305393219, |
| "epoch": 2.3968, |
| "grad_norm": 0.05029296875, |
| "learning_rate": 4.1758241758241765e-05, |
| "loss": 0.0686568260192871, |
| "mean_token_accuracy": 0.9741124615073204, |
| "num_tokens": 4166373.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 0.07181228250265122, |
| "epoch": 2.4288, |
| "grad_norm": 0.05029296875, |
| "learning_rate": 3.956043956043956e-05, |
| "loss": 0.06826226711273194, |
| "mean_token_accuracy": 0.9736026957631111, |
| "num_tokens": 4222123.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 0.0726727832108736, |
| "epoch": 2.4608, |
| "grad_norm": 0.046142578125, |
| "learning_rate": 3.7362637362637365e-05, |
| "loss": 0.06845790147781372, |
| "mean_token_accuracy": 0.9734082207083702, |
| "num_tokens": 4277783.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 0.0728993572294712, |
| "epoch": 2.4928, |
| "grad_norm": 0.050537109375, |
| "learning_rate": 3.516483516483517e-05, |
| "loss": 0.06819941997528076, |
| "mean_token_accuracy": 0.9740725710988045, |
| "num_tokens": 4333255.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 0.07271347604691983, |
| "epoch": 2.5248, |
| "grad_norm": 0.0546875, |
| "learning_rate": 3.296703296703297e-05, |
| "loss": 0.06898128986358643, |
| "mean_token_accuracy": 0.9741450414061547, |
| "num_tokens": 4388480.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 0.07090196693316102, |
| "epoch": 2.5568, |
| "grad_norm": 0.04638671875, |
| "learning_rate": 3.0769230769230774e-05, |
| "loss": 0.06766563653945923, |
| "mean_token_accuracy": 0.9739843040704728, |
| "num_tokens": 4444676.0, |
| "step": 800 |
| }, |
| { |
| "entropy": 0.0716133133508265, |
| "epoch": 2.5888, |
| "grad_norm": 0.050537109375, |
| "learning_rate": 2.857142857142857e-05, |
| "loss": 0.06745712161064148, |
| "mean_token_accuracy": 0.9742712348699569, |
| "num_tokens": 4500509.0, |
| "step": 810 |
| }, |
| { |
| "entropy": 0.07198168560862542, |
| "epoch": 2.6208, |
| "grad_norm": 0.052734375, |
| "learning_rate": 2.6373626373626374e-05, |
| "loss": 0.06861351728439331, |
| "mean_token_accuracy": 0.9730511695146561, |
| "num_tokens": 4555738.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 0.07194693582132458, |
| "epoch": 2.6528, |
| "grad_norm": 0.055419921875, |
| "learning_rate": 2.4175824175824177e-05, |
| "loss": 0.06737480759620666, |
| "mean_token_accuracy": 0.9742139622569084, |
| "num_tokens": 4611454.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 0.07176698800176382, |
| "epoch": 2.6848, |
| "grad_norm": 0.0537109375, |
| "learning_rate": 2.1978021978021977e-05, |
| "loss": 0.06750304102897645, |
| "mean_token_accuracy": 0.9739004611968994, |
| "num_tokens": 4667407.0, |
| "step": 840 |
| }, |
| { |
| "entropy": 0.07237117197364569, |
| "epoch": 2.7168, |
| "grad_norm": 0.04931640625, |
| "learning_rate": 1.978021978021978e-05, |
| "loss": 0.06796355247497558, |
| "mean_token_accuracy": 0.9741956070065498, |
| "num_tokens": 4723097.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 0.07233156580477954, |
| "epoch": 2.7488, |
| "grad_norm": 0.07666015625, |
| "learning_rate": 1.7582417582417584e-05, |
| "loss": 0.06823940873146057, |
| "mean_token_accuracy": 0.9739502936601638, |
| "num_tokens": 4778509.0, |
| "step": 860 |
| }, |
| { |
| "entropy": 0.07178980130702257, |
| "epoch": 2.7808, |
| "grad_norm": 0.056396484375, |
| "learning_rate": 1.5384615384615387e-05, |
| "loss": 0.06750970482826232, |
| "mean_token_accuracy": 0.9740586042404175, |
| "num_tokens": 4834011.0, |
| "step": 870 |
| }, |
| { |
| "entropy": 0.07171082906425, |
| "epoch": 2.8128, |
| "grad_norm": 0.053955078125, |
| "learning_rate": 1.3186813186813187e-05, |
| "loss": 0.06713088154792786, |
| "mean_token_accuracy": 0.9745032519102097, |
| "num_tokens": 4889429.0, |
| "step": 880 |
| }, |
| { |
| "entropy": 0.07134337816387415, |
| "epoch": 2.8448, |
| "grad_norm": 0.056884765625, |
| "learning_rate": 1.0989010989010989e-05, |
| "loss": 0.06686720848083497, |
| "mean_token_accuracy": 0.9745988816022872, |
| "num_tokens": 4944966.0, |
| "step": 890 |
| }, |
| { |
| "entropy": 0.07198897190392017, |
| "epoch": 2.8768000000000002, |
| "grad_norm": 0.051513671875, |
| "learning_rate": 8.791208791208792e-06, |
| "loss": 0.06727443933486939, |
| "mean_token_accuracy": 0.9737225085496902, |
| "num_tokens": 5000546.0, |
| "step": 900 |
| }, |
| { |
| "entropy": 0.07114618215709925, |
| "epoch": 2.9088000000000003, |
| "grad_norm": 0.04833984375, |
| "learning_rate": 6.5934065934065935e-06, |
| "loss": 0.0675000011920929, |
| "mean_token_accuracy": 0.9743592411279678, |
| "num_tokens": 5056400.0, |
| "step": 910 |
| }, |
| { |
| "entropy": 0.07034891471266747, |
| "epoch": 2.9408, |
| "grad_norm": 0.05224609375, |
| "learning_rate": 4.395604395604396e-06, |
| "loss": 0.06710875034332275, |
| "mean_token_accuracy": 0.9740387976169587, |
| "num_tokens": 5111854.0, |
| "step": 920 |
| }, |
| { |
| "entropy": 0.07230036649852992, |
| "epoch": 2.9728, |
| "grad_norm": 0.05419921875, |
| "learning_rate": 2.197802197802198e-06, |
| "loss": 0.06789053678512573, |
| "mean_token_accuracy": 0.9737495318055153, |
| "num_tokens": 5167285.0, |
| "step": 930 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 939, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.4252630664691712e+17, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|