| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 625, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0016, | |
| "grad_norm": 10.975202560424805, | |
| "learning_rate": 0.0, | |
| "loss": 4.4556, | |
| "mean_token_accuracy": 0.30622560530900955, | |
| "num_tokens": 6446.0, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0032, | |
| "grad_norm": 11.116790771484375, | |
| "learning_rate": 1.0526315789473684e-05, | |
| "loss": 4.4789, | |
| "mean_token_accuracy": 0.30873720347881317, | |
| "num_tokens": 12801.0, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.0048, | |
| "grad_norm": 11.020855903625488, | |
| "learning_rate": 2.105263157894737e-05, | |
| "loss": 4.4585, | |
| "mean_token_accuracy": 0.3004137650132179, | |
| "num_tokens": 19243.0, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.0064, | |
| "grad_norm": 10.774847030639648, | |
| "learning_rate": 3.157894736842105e-05, | |
| "loss": 4.3996, | |
| "mean_token_accuracy": 0.3128996789455414, | |
| "num_tokens": 25736.0, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.008, | |
| "grad_norm": 11.36635684967041, | |
| "learning_rate": 4.210526315789474e-05, | |
| "loss": 4.4172, | |
| "mean_token_accuracy": 0.30812356621026993, | |
| "num_tokens": 32108.0, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.0096, | |
| "grad_norm": 10.182173728942871, | |
| "learning_rate": 5.2631578947368424e-05, | |
| "loss": 4.2889, | |
| "mean_token_accuracy": 0.32208921015262604, | |
| "num_tokens": 38450.0, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.0112, | |
| "grad_norm": 8.852214813232422, | |
| "learning_rate": 6.31578947368421e-05, | |
| "loss": 4.0288, | |
| "mean_token_accuracy": 0.34315764904022217, | |
| "num_tokens": 44839.0, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.0128, | |
| "grad_norm": 7.8686909675598145, | |
| "learning_rate": 7.368421052631579e-05, | |
| "loss": 3.8561, | |
| "mean_token_accuracy": 0.35557373613119125, | |
| "num_tokens": 51269.0, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.0144, | |
| "grad_norm": 7.373384952545166, | |
| "learning_rate": 8.421052631578948e-05, | |
| "loss": 3.6472, | |
| "mean_token_accuracy": 0.40004485100507736, | |
| "num_tokens": 57623.0, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 6.777977466583252, | |
| "learning_rate": 9.473684210526316e-05, | |
| "loss": 3.4499, | |
| "mean_token_accuracy": 0.4237586483359337, | |
| "num_tokens": 63937.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0176, | |
| "grad_norm": 6.255558013916016, | |
| "learning_rate": 0.00010526315789473685, | |
| "loss": 3.2124, | |
| "mean_token_accuracy": 0.45736898481845856, | |
| "num_tokens": 70262.0, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.0192, | |
| "grad_norm": 6.936082363128662, | |
| "learning_rate": 0.00011578947368421053, | |
| "loss": 2.9288, | |
| "mean_token_accuracy": 0.5076272785663605, | |
| "num_tokens": 76638.0, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.0208, | |
| "grad_norm": 5.14530611038208, | |
| "learning_rate": 0.0001263157894736842, | |
| "loss": 2.6819, | |
| "mean_token_accuracy": 0.5418585389852524, | |
| "num_tokens": 83006.0, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.0224, | |
| "grad_norm": 4.220037460327148, | |
| "learning_rate": 0.0001368421052631579, | |
| "loss": 2.4228, | |
| "mean_token_accuracy": 0.5702001601457596, | |
| "num_tokens": 89416.0, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "grad_norm": 4.017765998840332, | |
| "learning_rate": 0.00014736842105263158, | |
| "loss": 2.152, | |
| "mean_token_accuracy": 0.5957600474357605, | |
| "num_tokens": 95867.0, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.0256, | |
| "grad_norm": 3.0187978744506836, | |
| "learning_rate": 0.00015789473684210527, | |
| "loss": 1.9412, | |
| "mean_token_accuracy": 0.6025321185588837, | |
| "num_tokens": 102128.0, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.0272, | |
| "grad_norm": 3.0175063610076904, | |
| "learning_rate": 0.00016842105263157895, | |
| "loss": 1.8326, | |
| "mean_token_accuracy": 0.5870110392570496, | |
| "num_tokens": 108492.0, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.0288, | |
| "grad_norm": 2.52016282081604, | |
| "learning_rate": 0.00017894736842105264, | |
| "loss": 1.7301, | |
| "mean_token_accuracy": 0.5975689738988876, | |
| "num_tokens": 114941.0, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.0304, | |
| "grad_norm": 2.2820112705230713, | |
| "learning_rate": 0.00018947368421052632, | |
| "loss": 1.6282, | |
| "mean_token_accuracy": 0.6018220335245132, | |
| "num_tokens": 121380.0, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 2.29760479927063, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4959, | |
| "mean_token_accuracy": 0.6502654552459717, | |
| "num_tokens": 127819.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0336, | |
| "grad_norm": 2.2491250038146973, | |
| "learning_rate": 0.00019999879061093312, | |
| "loss": 1.3419, | |
| "mean_token_accuracy": 0.7002265751361847, | |
| "num_tokens": 134293.0, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.0352, | |
| "grad_norm": 2.235426187515259, | |
| "learning_rate": 0.0001999951624762352, | |
| "loss": 1.201, | |
| "mean_token_accuracy": 0.7342203259468079, | |
| "num_tokens": 140724.0, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.0368, | |
| "grad_norm": 2.037572145462036, | |
| "learning_rate": 0.00019998911569341348, | |
| "loss": 1.0607, | |
| "mean_token_accuracy": 0.769325003027916, | |
| "num_tokens": 147156.0, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.0384, | |
| "grad_norm": 1.93583083152771, | |
| "learning_rate": 0.0001999806504249771, | |
| "loss": 0.8949, | |
| "mean_token_accuracy": 0.8047759681940079, | |
| "num_tokens": 153432.0, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.6834540367126465, | |
| "learning_rate": 0.00019996976689843287, | |
| "loss": 0.7485, | |
| "mean_token_accuracy": 0.8383188247680664, | |
| "num_tokens": 159801.0, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.0416, | |
| "grad_norm": 1.5332688093185425, | |
| "learning_rate": 0.0001999564654062789, | |
| "loss": 0.6339, | |
| "mean_token_accuracy": 0.8577111959457397, | |
| "num_tokens": 166141.0, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.0432, | |
| "grad_norm": 1.5257647037506104, | |
| "learning_rate": 0.00019994074630599705, | |
| "loss": 0.5286, | |
| "mean_token_accuracy": 0.8894002586603165, | |
| "num_tokens": 172492.0, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.0448, | |
| "grad_norm": 1.469301700592041, | |
| "learning_rate": 0.00019992261002004294, | |
| "loss": 0.4447, | |
| "mean_token_accuracy": 0.9105815589427948, | |
| "num_tokens": 178887.0, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.0464, | |
| "grad_norm": 1.2936886548995972, | |
| "learning_rate": 0.00019990205703583497, | |
| "loss": 0.3612, | |
| "mean_token_accuracy": 0.9369391947984695, | |
| "num_tokens": 185349.0, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 1.0115550756454468, | |
| "learning_rate": 0.00019987908790574104, | |
| "loss": 0.2562, | |
| "mean_token_accuracy": 0.9557851552963257, | |
| "num_tokens": 191651.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0496, | |
| "grad_norm": 0.9288735389709473, | |
| "learning_rate": 0.00019985370324706366, | |
| "loss": 0.2382, | |
| "mean_token_accuracy": 0.9607889354228973, | |
| "num_tokens": 198036.0, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.0512, | |
| "grad_norm": 1.2347880601882935, | |
| "learning_rate": 0.0001998259037420235, | |
| "loss": 0.2172, | |
| "mean_token_accuracy": 0.9599845558404922, | |
| "num_tokens": 204399.0, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.0528, | |
| "grad_norm": 0.7188531756401062, | |
| "learning_rate": 0.00019979569013774093, | |
| "loss": 0.2326, | |
| "mean_token_accuracy": 0.9579280912876129, | |
| "num_tokens": 210784.0, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.0544, | |
| "grad_norm": 0.48960864543914795, | |
| "learning_rate": 0.000199763063246216, | |
| "loss": 0.1875, | |
| "mean_token_accuracy": 0.9625789076089859, | |
| "num_tokens": 217138.0, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "grad_norm": 0.568114161491394, | |
| "learning_rate": 0.00019972802394430664, | |
| "loss": 0.1968, | |
| "mean_token_accuracy": 0.9585353434085846, | |
| "num_tokens": 223591.0, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.0576, | |
| "grad_norm": 0.42759305238723755, | |
| "learning_rate": 0.00019969057317370504, | |
| "loss": 0.1993, | |
| "mean_token_accuracy": 0.961366206407547, | |
| "num_tokens": 229978.0, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.0592, | |
| "grad_norm": 0.5354682803153992, | |
| "learning_rate": 0.00019965071194091237, | |
| "loss": 0.1934, | |
| "mean_token_accuracy": 0.957531601190567, | |
| "num_tokens": 236384.0, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.0608, | |
| "grad_norm": 0.3905318081378937, | |
| "learning_rate": 0.00019960844131721171, | |
| "loss": 0.186, | |
| "mean_token_accuracy": 0.9626945108175278, | |
| "num_tokens": 242854.0, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.0624, | |
| "grad_norm": 0.4046100378036499, | |
| "learning_rate": 0.00019956376243863926, | |
| "loss": 0.174, | |
| "mean_token_accuracy": 0.964872881770134, | |
| "num_tokens": 249242.0, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 0.40250322222709656, | |
| "learning_rate": 0.00019951667650595388, | |
| "loss": 0.1791, | |
| "mean_token_accuracy": 0.9627016186714172, | |
| "num_tokens": 255753.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.0656, | |
| "grad_norm": 0.41779595613479614, | |
| "learning_rate": 0.00019946718478460474, | |
| "loss": 0.173, | |
| "mean_token_accuracy": 0.9643158912658691, | |
| "num_tokens": 262093.0, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.0672, | |
| "grad_norm": 0.36981409788131714, | |
| "learning_rate": 0.0001994152886046973, | |
| "loss": 0.1612, | |
| "mean_token_accuracy": 0.9648505598306656, | |
| "num_tokens": 268469.0, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.0688, | |
| "grad_norm": 0.3900979459285736, | |
| "learning_rate": 0.00019936098936095765, | |
| "loss": 0.1548, | |
| "mean_token_accuracy": 0.9689272940158844, | |
| "num_tokens": 274814.0, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.0704, | |
| "grad_norm": 0.3215242624282837, | |
| "learning_rate": 0.00019930428851269488, | |
| "loss": 0.1837, | |
| "mean_token_accuracy": 0.96176777780056, | |
| "num_tokens": 281260.0, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.072, | |
| "grad_norm": 0.3260638117790222, | |
| "learning_rate": 0.00019924518758376208, | |
| "loss": 0.1657, | |
| "mean_token_accuracy": 0.9645452350378036, | |
| "num_tokens": 287604.0, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.0736, | |
| "grad_norm": 0.302793025970459, | |
| "learning_rate": 0.00019918368816251514, | |
| "loss": 0.1692, | |
| "mean_token_accuracy": 0.9634306281805038, | |
| "num_tokens": 294006.0, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.0752, | |
| "grad_norm": 0.3088330030441284, | |
| "learning_rate": 0.00019911979190177028, | |
| "loss": 0.1617, | |
| "mean_token_accuracy": 0.9675635695457458, | |
| "num_tokens": 300456.0, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.0768, | |
| "grad_norm": 0.28030574321746826, | |
| "learning_rate": 0.0001990535005187594, | |
| "loss": 0.1652, | |
| "mean_token_accuracy": 0.9627850204706192, | |
| "num_tokens": 306839.0, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.0784, | |
| "grad_norm": 0.2845500111579895, | |
| "learning_rate": 0.00019898481579508421, | |
| "loss": 0.1789, | |
| "mean_token_accuracy": 0.9626674503087997, | |
| "num_tokens": 313193.0, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.254579097032547, | |
| "learning_rate": 0.0001989137395766681, | |
| "loss": 0.1498, | |
| "mean_token_accuracy": 0.9689367115497589, | |
| "num_tokens": 319576.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0816, | |
| "grad_norm": 0.3165920376777649, | |
| "learning_rate": 0.00019884027377370668, | |
| "loss": 0.1822, | |
| "mean_token_accuracy": 0.9593835771083832, | |
| "num_tokens": 326119.0, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.0832, | |
| "grad_norm": 0.39432063698768616, | |
| "learning_rate": 0.0001987644203606164, | |
| "loss": 0.1775, | |
| "mean_token_accuracy": 0.961280420422554, | |
| "num_tokens": 332595.0, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.0848, | |
| "grad_norm": 0.2616019546985626, | |
| "learning_rate": 0.00019868618137598132, | |
| "loss": 0.1591, | |
| "mean_token_accuracy": 0.965641126036644, | |
| "num_tokens": 338978.0, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.0864, | |
| "grad_norm": 0.2596780061721802, | |
| "learning_rate": 0.00019860555892249875, | |
| "loss": 0.1634, | |
| "mean_token_accuracy": 0.9624787867069244, | |
| "num_tokens": 345434.0, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.088, | |
| "grad_norm": 0.2394765019416809, | |
| "learning_rate": 0.00019852255516692225, | |
| "loss": 0.1484, | |
| "mean_token_accuracy": 0.9672086089849472, | |
| "num_tokens": 351777.0, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.0896, | |
| "grad_norm": 0.25694701075553894, | |
| "learning_rate": 0.00019843717234000374, | |
| "loss": 0.1664, | |
| "mean_token_accuracy": 0.9624592810869217, | |
| "num_tokens": 358202.0, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.0912, | |
| "grad_norm": 0.26259931921958923, | |
| "learning_rate": 0.00019834941273643336, | |
| "loss": 0.1445, | |
| "mean_token_accuracy": 0.9688271433115005, | |
| "num_tokens": 364547.0, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.0928, | |
| "grad_norm": 0.28787609934806824, | |
| "learning_rate": 0.0001982592787147779, | |
| "loss": 0.1752, | |
| "mean_token_accuracy": 0.9638708084821701, | |
| "num_tokens": 371055.0, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.0944, | |
| "grad_norm": 0.21661171317100525, | |
| "learning_rate": 0.00019816677269741733, | |
| "loss": 0.1502, | |
| "mean_token_accuracy": 0.9649683982133865, | |
| "num_tokens": 377434.0, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 0.25617703795433044, | |
| "learning_rate": 0.00019807189717047986, | |
| "loss": 0.1557, | |
| "mean_token_accuracy": 0.9660284668207169, | |
| "num_tokens": 383863.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.0976, | |
| "grad_norm": 0.2642766833305359, | |
| "learning_rate": 0.0001979746546837749, | |
| "loss": 0.1658, | |
| "mean_token_accuracy": 0.9613157361745834, | |
| "num_tokens": 390279.0, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.0992, | |
| "grad_norm": 0.24200187623500824, | |
| "learning_rate": 0.00019787504785072463, | |
| "loss": 0.1616, | |
| "mean_token_accuracy": 0.9648346453905106, | |
| "num_tokens": 396749.0, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.1008, | |
| "grad_norm": 0.24220506846904755, | |
| "learning_rate": 0.0001977730793482939, | |
| "loss": 0.1543, | |
| "mean_token_accuracy": 0.9650003463029861, | |
| "num_tokens": 403078.0, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.1024, | |
| "grad_norm": 0.2305448353290558, | |
| "learning_rate": 0.00019766875191691802, | |
| "loss": 0.1463, | |
| "mean_token_accuracy": 0.9669727087020874, | |
| "num_tokens": 409537.0, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.104, | |
| "grad_norm": 0.20737282931804657, | |
| "learning_rate": 0.00019756206836042938, | |
| "loss": 0.1547, | |
| "mean_token_accuracy": 0.9661688953638077, | |
| "num_tokens": 416078.0, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.1056, | |
| "grad_norm": 0.21056829392910004, | |
| "learning_rate": 0.00019745303154598186, | |
| "loss": 0.1399, | |
| "mean_token_accuracy": 0.9682707488536835, | |
| "num_tokens": 422478.0, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.1072, | |
| "grad_norm": 0.22398342192173004, | |
| "learning_rate": 0.00019734164440397397, | |
| "loss": 0.1426, | |
| "mean_token_accuracy": 0.9672913551330566, | |
| "num_tokens": 428818.0, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.1088, | |
| "grad_norm": 0.20356997847557068, | |
| "learning_rate": 0.00019722790992796995, | |
| "loss": 0.1417, | |
| "mean_token_accuracy": 0.9669185429811478, | |
| "num_tokens": 435184.0, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.1104, | |
| "grad_norm": 0.2131126970052719, | |
| "learning_rate": 0.00019711183117461942, | |
| "loss": 0.1367, | |
| "mean_token_accuracy": 0.9693567156791687, | |
| "num_tokens": 441562.0, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 0.19927483797073364, | |
| "learning_rate": 0.00019699341126357513, | |
| "loss": 0.1462, | |
| "mean_token_accuracy": 0.9673868119716644, | |
| "num_tokens": 447896.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.1136, | |
| "grad_norm": 0.1976092904806137, | |
| "learning_rate": 0.0001968726533774092, | |
| "loss": 0.1539, | |
| "mean_token_accuracy": 0.9651944190263748, | |
| "num_tokens": 454307.0, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.1152, | |
| "grad_norm": 0.23401835560798645, | |
| "learning_rate": 0.00019674956076152762, | |
| "loss": 0.1633, | |
| "mean_token_accuracy": 0.962654784321785, | |
| "num_tokens": 460768.0, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.1168, | |
| "grad_norm": 0.23897838592529297, | |
| "learning_rate": 0.00019662413672408288, | |
| "loss": 0.1507, | |
| "mean_token_accuracy": 0.9647169411182404, | |
| "num_tokens": 467195.0, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.1184, | |
| "grad_norm": 0.21890956163406372, | |
| "learning_rate": 0.00019649638463588523, | |
| "loss": 0.147, | |
| "mean_token_accuracy": 0.9696839302778244, | |
| "num_tokens": 473450.0, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.21364597976207733, | |
| "learning_rate": 0.00019636630793031193, | |
| "loss": 0.1498, | |
| "mean_token_accuracy": 0.9646295458078384, | |
| "num_tokens": 479938.0, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.1216, | |
| "grad_norm": 0.20200735330581665, | |
| "learning_rate": 0.0001962339101032151, | |
| "loss": 0.129, | |
| "mean_token_accuracy": 0.9713664948940277, | |
| "num_tokens": 486337.0, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.1232, | |
| "grad_norm": 0.21064621210098267, | |
| "learning_rate": 0.0001960991947128278, | |
| "loss": 0.1432, | |
| "mean_token_accuracy": 0.9702709466218948, | |
| "num_tokens": 492640.0, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.1248, | |
| "grad_norm": 0.20065751671791077, | |
| "learning_rate": 0.00019596216537966818, | |
| "loss": 0.1364, | |
| "mean_token_accuracy": 0.9683724045753479, | |
| "num_tokens": 499072.0, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.1264, | |
| "grad_norm": 0.2167198210954666, | |
| "learning_rate": 0.00019582282578644244, | |
| "loss": 0.1461, | |
| "mean_token_accuracy": 0.9670031070709229, | |
| "num_tokens": 505595.0, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 0.21613095700740814, | |
| "learning_rate": 0.0001956811796779457, | |
| "loss": 0.1492, | |
| "mean_token_accuracy": 0.966914176940918, | |
| "num_tokens": 511983.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.1296, | |
| "grad_norm": 0.22970137000083923, | |
| "learning_rate": 0.00019553723086096142, | |
| "loss": 0.138, | |
| "mean_token_accuracy": 0.9701613485813141, | |
| "num_tokens": 518332.0, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.1312, | |
| "grad_norm": 0.19586379826068878, | |
| "learning_rate": 0.00019539098320415902, | |
| "loss": 0.1317, | |
| "mean_token_accuracy": 0.9686425179243088, | |
| "num_tokens": 524784.0, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.1328, | |
| "grad_norm": 0.20189055800437927, | |
| "learning_rate": 0.00019524244063799003, | |
| "loss": 0.1437, | |
| "mean_token_accuracy": 0.9652739316225052, | |
| "num_tokens": 531280.0, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.1344, | |
| "grad_norm": 0.23265193402767181, | |
| "learning_rate": 0.00019509160715458233, | |
| "loss": 0.1403, | |
| "mean_token_accuracy": 0.9670985192060471, | |
| "num_tokens": 537621.0, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.136, | |
| "grad_norm": 0.190333291888237, | |
| "learning_rate": 0.0001949384868076329, | |
| "loss": 0.1249, | |
| "mean_token_accuracy": 0.9695749282836914, | |
| "num_tokens": 543978.0, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.1376, | |
| "grad_norm": 0.19643469154834747, | |
| "learning_rate": 0.000194783083712299, | |
| "loss": 0.1314, | |
| "mean_token_accuracy": 0.9683917015790939, | |
| "num_tokens": 550418.0, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.1392, | |
| "grad_norm": 0.19404257833957672, | |
| "learning_rate": 0.00019462540204508738, | |
| "loss": 0.1455, | |
| "mean_token_accuracy": 0.9701170474290848, | |
| "num_tokens": 556758.0, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.1408, | |
| "grad_norm": 0.19332736730575562, | |
| "learning_rate": 0.00019446544604374215, | |
| "loss": 0.1275, | |
| "mean_token_accuracy": 0.9712614119052887, | |
| "num_tokens": 563171.0, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.1424, | |
| "grad_norm": 0.20570771396160126, | |
| "learning_rate": 0.00019430322000713076, | |
| "loss": 0.1357, | |
| "mean_token_accuracy": 0.9676992744207382, | |
| "num_tokens": 569552.0, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": 0.2184784710407257, | |
| "learning_rate": 0.00019413872829512874, | |
| "loss": 0.1461, | |
| "mean_token_accuracy": 0.964031919836998, | |
| "num_tokens": 576063.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.1456, | |
| "grad_norm": 0.21943870186805725, | |
| "learning_rate": 0.00019397197532850224, | |
| "loss": 0.1415, | |
| "mean_token_accuracy": 0.9678437560796738, | |
| "num_tokens": 582476.0, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.1472, | |
| "grad_norm": 0.23238135874271393, | |
| "learning_rate": 0.0001938029655887894, | |
| "loss": 0.143, | |
| "mean_token_accuracy": 0.9663237482309341, | |
| "num_tokens": 588931.0, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.1488, | |
| "grad_norm": 0.2303759604692459, | |
| "learning_rate": 0.00019363170361817971, | |
| "loss": 0.1425, | |
| "mean_token_accuracy": 0.9642046988010406, | |
| "num_tokens": 595430.0, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.1504, | |
| "grad_norm": 0.2182934582233429, | |
| "learning_rate": 0.00019345819401939227, | |
| "loss": 0.1593, | |
| "mean_token_accuracy": 0.9646277278661728, | |
| "num_tokens": 601885.0, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.152, | |
| "grad_norm": 0.22399112582206726, | |
| "learning_rate": 0.00019328244145555177, | |
| "loss": 0.1463, | |
| "mean_token_accuracy": 0.9659337252378464, | |
| "num_tokens": 608346.0, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.1536, | |
| "grad_norm": 0.20376521348953247, | |
| "learning_rate": 0.00019310445065006323, | |
| "loss": 0.1482, | |
| "mean_token_accuracy": 0.9651739150285721, | |
| "num_tokens": 614798.0, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.1552, | |
| "grad_norm": 0.19498969614505768, | |
| "learning_rate": 0.00019292422638648527, | |
| "loss": 0.1403, | |
| "mean_token_accuracy": 0.967052087187767, | |
| "num_tokens": 621279.0, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.1568, | |
| "grad_norm": 0.1908182054758072, | |
| "learning_rate": 0.00019274177350840125, | |
| "loss": 0.1531, | |
| "mean_token_accuracy": 0.9672629833221436, | |
| "num_tokens": 627692.0, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.1584, | |
| "grad_norm": 0.2237246036529541, | |
| "learning_rate": 0.0001925570969192894, | |
| "loss": 0.1412, | |
| "mean_token_accuracy": 0.9683457463979721, | |
| "num_tokens": 634074.0, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.23452997207641602, | |
| "learning_rate": 0.00019237020158239065, | |
| "loss": 0.1704, | |
| "mean_token_accuracy": 0.9619830250740051, | |
| "num_tokens": 640627.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.1616, | |
| "grad_norm": 0.23753929138183594, | |
| "learning_rate": 0.0001921810925205757, | |
| "loss": 0.1315, | |
| "mean_token_accuracy": 0.9682804495096207, | |
| "num_tokens": 647084.0, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.1632, | |
| "grad_norm": 0.1882672756910324, | |
| "learning_rate": 0.00019198977481620967, | |
| "loss": 0.1229, | |
| "mean_token_accuracy": 0.9720747470855713, | |
| "num_tokens": 653436.0, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.1648, | |
| "grad_norm": 0.21557772159576416, | |
| "learning_rate": 0.00019179625361101564, | |
| "loss": 0.1415, | |
| "mean_token_accuracy": 0.9670463502407074, | |
| "num_tokens": 659791.0, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.1664, | |
| "grad_norm": 0.2052602469921112, | |
| "learning_rate": 0.00019160053410593653, | |
| "loss": 0.1411, | |
| "mean_token_accuracy": 0.9689261317253113, | |
| "num_tokens": 666173.0, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.168, | |
| "grad_norm": 0.19542425870895386, | |
| "learning_rate": 0.0001914026215609952, | |
| "loss": 0.1357, | |
| "mean_token_accuracy": 0.9687201231718063, | |
| "num_tokens": 672482.0, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.1696, | |
| "grad_norm": 0.18486933410167694, | |
| "learning_rate": 0.00019120252129515322, | |
| "loss": 0.1256, | |
| "mean_token_accuracy": 0.9707110822200775, | |
| "num_tokens": 678806.0, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.1712, | |
| "grad_norm": 0.21281570196151733, | |
| "learning_rate": 0.0001910002386861677, | |
| "loss": 0.1442, | |
| "mean_token_accuracy": 0.9668544679880142, | |
| "num_tokens": 685217.0, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.1728, | |
| "grad_norm": 0.21555405855178833, | |
| "learning_rate": 0.00019079577917044705, | |
| "loss": 0.1484, | |
| "mean_token_accuracy": 0.9638047218322754, | |
| "num_tokens": 691799.0, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.1744, | |
| "grad_norm": 0.20796743035316467, | |
| "learning_rate": 0.00019058914824290465, | |
| "loss": 0.1311, | |
| "mean_token_accuracy": 0.9686572104692459, | |
| "num_tokens": 698210.0, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": 0.22029848396778107, | |
| "learning_rate": 0.00019038035145681125, | |
| "loss": 0.1332, | |
| "mean_token_accuracy": 0.9675081074237823, | |
| "num_tokens": 704677.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.1776, | |
| "grad_norm": 0.2398574948310852, | |
| "learning_rate": 0.00019016939442364578, | |
| "loss": 0.1614, | |
| "mean_token_accuracy": 0.9642173200845718, | |
| "num_tokens": 711157.0, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.1792, | |
| "grad_norm": 0.2306092381477356, | |
| "learning_rate": 0.00018995628281294442, | |
| "loss": 0.1507, | |
| "mean_token_accuracy": 0.9684281200170517, | |
| "num_tokens": 717533.0, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.1808, | |
| "grad_norm": 0.19873455166816711, | |
| "learning_rate": 0.00018974102235214834, | |
| "loss": 0.1345, | |
| "mean_token_accuracy": 0.9662847220897675, | |
| "num_tokens": 724058.0, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.1824, | |
| "grad_norm": 0.20419520139694214, | |
| "learning_rate": 0.0001895236188264497, | |
| "loss": 0.1251, | |
| "mean_token_accuracy": 0.9721644371747971, | |
| "num_tokens": 730398.0, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.184, | |
| "grad_norm": 0.2063436657190323, | |
| "learning_rate": 0.00018930407807863628, | |
| "loss": 0.1217, | |
| "mean_token_accuracy": 0.973570704460144, | |
| "num_tokens": 736729.0, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.1856, | |
| "grad_norm": 0.21594303846359253, | |
| "learning_rate": 0.00018908240600893419, | |
| "loss": 0.1502, | |
| "mean_token_accuracy": 0.9632740765810013, | |
| "num_tokens": 743231.0, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.1872, | |
| "grad_norm": 0.23267535865306854, | |
| "learning_rate": 0.00018885860857484972, | |
| "loss": 0.1576, | |
| "mean_token_accuracy": 0.9640915095806122, | |
| "num_tokens": 749705.0, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.1888, | |
| "grad_norm": 0.19092179834842682, | |
| "learning_rate": 0.00018863269179100875, | |
| "loss": 0.1209, | |
| "mean_token_accuracy": 0.9690503478050232, | |
| "num_tokens": 756177.0, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.1904, | |
| "grad_norm": 0.2067628651857376, | |
| "learning_rate": 0.00018840466172899553, | |
| "loss": 0.1263, | |
| "mean_token_accuracy": 0.9677437245845795, | |
| "num_tokens": 762668.0, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 0.19694143533706665, | |
| "learning_rate": 0.00018817452451718928, | |
| "loss": 0.1303, | |
| "mean_token_accuracy": 0.9699513912200928, | |
| "num_tokens": 769075.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.1936, | |
| "grad_norm": 0.22565840184688568, | |
| "learning_rate": 0.0001879422863405995, | |
| "loss": 0.1251, | |
| "mean_token_accuracy": 0.969616174697876, | |
| "num_tokens": 775311.0, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.1952, | |
| "grad_norm": 0.21298082172870636, | |
| "learning_rate": 0.00018770795344069972, | |
| "loss": 0.1338, | |
| "mean_token_accuracy": 0.9679106324911118, | |
| "num_tokens": 781744.0, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.1968, | |
| "grad_norm": 0.22187118232250214, | |
| "learning_rate": 0.00018747153211525996, | |
| "loss": 0.1341, | |
| "mean_token_accuracy": 0.9675947576761246, | |
| "num_tokens": 788168.0, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.1984, | |
| "grad_norm": 0.18820776045322418, | |
| "learning_rate": 0.00018723302871817717, | |
| "loss": 0.1427, | |
| "mean_token_accuracy": 0.9689621776342392, | |
| "num_tokens": 794517.0, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.21196430921554565, | |
| "learning_rate": 0.00018699244965930475, | |
| "loss": 0.1446, | |
| "mean_token_accuracy": 0.9651871025562286, | |
| "num_tokens": 800972.0, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.2016, | |
| "grad_norm": 0.23796945810317993, | |
| "learning_rate": 0.00018674980140428012, | |
| "loss": 0.1462, | |
| "mean_token_accuracy": 0.9649701118469238, | |
| "num_tokens": 807427.0, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.2032, | |
| "grad_norm": 0.20141765475273132, | |
| "learning_rate": 0.00018650509047435094, | |
| "loss": 0.1372, | |
| "mean_token_accuracy": 0.9681167155504227, | |
| "num_tokens": 813827.0, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.2048, | |
| "grad_norm": 0.19517339766025543, | |
| "learning_rate": 0.0001862583234462, | |
| "loss": 0.123, | |
| "mean_token_accuracy": 0.968400165438652, | |
| "num_tokens": 820335.0, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.2064, | |
| "grad_norm": 0.23840022087097168, | |
| "learning_rate": 0.00018600950695176827, | |
| "loss": 0.1307, | |
| "mean_token_accuracy": 0.969053253531456, | |
| "num_tokens": 826813.0, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "grad_norm": 0.2128070741891861, | |
| "learning_rate": 0.00018575864767807684, | |
| "loss": 0.1334, | |
| "mean_token_accuracy": 0.9681389629840851, | |
| "num_tokens": 833184.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.2096, | |
| "grad_norm": 0.20229843258857727, | |
| "learning_rate": 0.00018550575236704712, | |
| "loss": 0.1419, | |
| "mean_token_accuracy": 0.9684087783098221, | |
| "num_tokens": 839585.0, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.2112, | |
| "grad_norm": 0.19595421850681305, | |
| "learning_rate": 0.00018525082781531963, | |
| "loss": 0.124, | |
| "mean_token_accuracy": 0.9709577709436417, | |
| "num_tokens": 845916.0, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.2128, | |
| "grad_norm": 0.22639180719852448, | |
| "learning_rate": 0.00018499388087407138, | |
| "loss": 0.133, | |
| "mean_token_accuracy": 0.9666442424058914, | |
| "num_tokens": 852341.0, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.2144, | |
| "grad_norm": 0.22204461693763733, | |
| "learning_rate": 0.00018473491844883178, | |
| "loss": 0.1297, | |
| "mean_token_accuracy": 0.969649538397789, | |
| "num_tokens": 858682.0, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.216, | |
| "grad_norm": 0.2082476168870926, | |
| "learning_rate": 0.00018447394749929692, | |
| "loss": 0.1228, | |
| "mean_token_accuracy": 0.9682896137237549, | |
| "num_tokens": 865099.0, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.2176, | |
| "grad_norm": 0.239635169506073, | |
| "learning_rate": 0.00018421097503914266, | |
| "loss": 0.1513, | |
| "mean_token_accuracy": 0.9648119211196899, | |
| "num_tokens": 871585.0, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.2192, | |
| "grad_norm": 0.21992094814777374, | |
| "learning_rate": 0.00018394600813583607, | |
| "loss": 0.1301, | |
| "mean_token_accuracy": 0.9676744788885117, | |
| "num_tokens": 877938.0, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.2208, | |
| "grad_norm": 0.20596052706241608, | |
| "learning_rate": 0.00018367905391044549, | |
| "loss": 0.1403, | |
| "mean_token_accuracy": 0.9667653441429138, | |
| "num_tokens": 884352.0, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.2224, | |
| "grad_norm": 0.21477219462394714, | |
| "learning_rate": 0.00018341011953744923, | |
| "loss": 0.1188, | |
| "mean_token_accuracy": 0.9719399064779282, | |
| "num_tokens": 890729.0, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 0.22649620473384857, | |
| "learning_rate": 0.00018313921224454252, | |
| "loss": 0.1147, | |
| "mean_token_accuracy": 0.9703161418437958, | |
| "num_tokens": 897082.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.2256, | |
| "grad_norm": 0.21603061258792877, | |
| "learning_rate": 0.00018286633931244366, | |
| "loss": 0.1306, | |
| "mean_token_accuracy": 0.9716930836439133, | |
| "num_tokens": 903528.0, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.2272, | |
| "grad_norm": 0.21680070459842682, | |
| "learning_rate": 0.00018259150807469797, | |
| "loss": 0.1158, | |
| "mean_token_accuracy": 0.9711233228445053, | |
| "num_tokens": 909881.0, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.2288, | |
| "grad_norm": 0.2424672693014145, | |
| "learning_rate": 0.00018231472591748082, | |
| "loss": 0.1345, | |
| "mean_token_accuracy": 0.970282718539238, | |
| "num_tokens": 916278.0, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.2304, | |
| "grad_norm": 0.2153424471616745, | |
| "learning_rate": 0.00018203600027939926, | |
| "loss": 0.1338, | |
| "mean_token_accuracy": 0.9689566493034363, | |
| "num_tokens": 922692.0, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.232, | |
| "grad_norm": 0.21825724840164185, | |
| "learning_rate": 0.00018175533865129193, | |
| "loss": 0.1218, | |
| "mean_token_accuracy": 0.9702320545911789, | |
| "num_tokens": 929055.0, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.2336, | |
| "grad_norm": 0.2290211170911789, | |
| "learning_rate": 0.0001814727485760278, | |
| "loss": 0.1299, | |
| "mean_token_accuracy": 0.9686111658811569, | |
| "num_tokens": 935470.0, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.2352, | |
| "grad_norm": 0.2108326554298401, | |
| "learning_rate": 0.00018118823764830356, | |
| "loss": 0.1341, | |
| "mean_token_accuracy": 0.9687074422836304, | |
| "num_tokens": 941947.0, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.2368, | |
| "grad_norm": 0.18111184239387512, | |
| "learning_rate": 0.0001809018135144392, | |
| "loss": 0.1035, | |
| "mean_token_accuracy": 0.9740245640277863, | |
| "num_tokens": 948269.0, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.2384, | |
| "grad_norm": 0.2083301693201065, | |
| "learning_rate": 0.00018061348387217295, | |
| "loss": 0.1385, | |
| "mean_token_accuracy": 0.9670233130455017, | |
| "num_tokens": 954614.0, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.2112870216369629, | |
| "learning_rate": 0.00018032325647045403, | |
| "loss": 0.1315, | |
| "mean_token_accuracy": 0.9662119150161743, | |
| "num_tokens": 961106.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.2416, | |
| "grad_norm": 0.18090932071208954, | |
| "learning_rate": 0.00018003113910923461, | |
| "loss": 0.1349, | |
| "mean_token_accuracy": 0.9668967127799988, | |
| "num_tokens": 967519.0, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.2432, | |
| "grad_norm": 0.2193027287721634, | |
| "learning_rate": 0.00017973713963926008, | |
| "loss": 0.1383, | |
| "mean_token_accuracy": 0.967611238360405, | |
| "num_tokens": 973928.0, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.2448, | |
| "grad_norm": 0.228580504655838, | |
| "learning_rate": 0.0001794412659618581, | |
| "loss": 0.1392, | |
| "mean_token_accuracy": 0.9669467955827713, | |
| "num_tokens": 980339.0, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.2464, | |
| "grad_norm": 0.21165654063224792, | |
| "learning_rate": 0.00017914352602872626, | |
| "loss": 0.126, | |
| "mean_token_accuracy": 0.969243586063385, | |
| "num_tokens": 986731.0, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.248, | |
| "grad_norm": 0.21300143003463745, | |
| "learning_rate": 0.00017884392784171831, | |
| "loss": 0.1172, | |
| "mean_token_accuracy": 0.9721066653728485, | |
| "num_tokens": 993114.0, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.2496, | |
| "grad_norm": 0.20460815727710724, | |
| "learning_rate": 0.00017854247945262917, | |
| "loss": 0.1263, | |
| "mean_token_accuracy": 0.967908188700676, | |
| "num_tokens": 999518.0, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.2512, | |
| "grad_norm": 0.21017986536026, | |
| "learning_rate": 0.00017823918896297852, | |
| "loss": 0.1337, | |
| "mean_token_accuracy": 0.968003123998642, | |
| "num_tokens": 1005891.0, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.2528, | |
| "grad_norm": 0.2038867175579071, | |
| "learning_rate": 0.00017793406452379314, | |
| "loss": 0.1153, | |
| "mean_token_accuracy": 0.9713983237743378, | |
| "num_tokens": 1012233.0, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.2544, | |
| "grad_norm": 0.23962758481502533, | |
| "learning_rate": 0.00017762711433538765, | |
| "loss": 0.1161, | |
| "mean_token_accuracy": 0.9698656499385834, | |
| "num_tokens": 1018606.0, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 0.22777049243450165, | |
| "learning_rate": 0.00017731834664714438, | |
| "loss": 0.1291, | |
| "mean_token_accuracy": 0.9689466655254364, | |
| "num_tokens": 1024989.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.2576, | |
| "grad_norm": 0.22985287010669708, | |
| "learning_rate": 0.00017700776975729138, | |
| "loss": 0.1424, | |
| "mean_token_accuracy": 0.968091607093811, | |
| "num_tokens": 1031448.0, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.2592, | |
| "grad_norm": 0.20628033578395844, | |
| "learning_rate": 0.00017669539201267974, | |
| "loss": 0.1203, | |
| "mean_token_accuracy": 0.9699797928333282, | |
| "num_tokens": 1037943.0, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.2608, | |
| "grad_norm": 0.230947807431221, | |
| "learning_rate": 0.0001763812218085589, | |
| "loss": 0.1144, | |
| "mean_token_accuracy": 0.9718270003795624, | |
| "num_tokens": 1044335.0, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.2624, | |
| "grad_norm": 0.23480476438999176, | |
| "learning_rate": 0.00017606526758835145, | |
| "loss": 0.1344, | |
| "mean_token_accuracy": 0.9661828726530075, | |
| "num_tokens": 1050858.0, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.264, | |
| "grad_norm": 0.286101758480072, | |
| "learning_rate": 0.00017574753784342566, | |
| "loss": 0.1576, | |
| "mean_token_accuracy": 0.9625564217567444, | |
| "num_tokens": 1057344.0, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.2656, | |
| "grad_norm": 0.27905723452568054, | |
| "learning_rate": 0.00017542804111286785, | |
| "loss": 0.1187, | |
| "mean_token_accuracy": 0.9719154685735703, | |
| "num_tokens": 1063727.0, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.2672, | |
| "grad_norm": 0.23572972416877747, | |
| "learning_rate": 0.00017510678598325248, | |
| "loss": 0.1285, | |
| "mean_token_accuracy": 0.9678687751293182, | |
| "num_tokens": 1070183.0, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.2688, | |
| "grad_norm": 0.2037675976753235, | |
| "learning_rate": 0.0001747837810884116, | |
| "loss": 0.1097, | |
| "mean_token_accuracy": 0.9717495441436768, | |
| "num_tokens": 1076505.0, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.2704, | |
| "grad_norm": 0.2096836119890213, | |
| "learning_rate": 0.00017445903510920278, | |
| "loss": 0.1183, | |
| "mean_token_accuracy": 0.9706638306379318, | |
| "num_tokens": 1082788.0, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "grad_norm": 0.19274486601352692, | |
| "learning_rate": 0.00017413255677327564, | |
| "loss": 0.1195, | |
| "mean_token_accuracy": 0.9717435389757156, | |
| "num_tokens": 1089139.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.2736, | |
| "grad_norm": 0.21763110160827637, | |
| "learning_rate": 0.00017380435485483755, | |
| "loss": 0.1319, | |
| "mean_token_accuracy": 0.9673854559659958, | |
| "num_tokens": 1095559.0, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.2752, | |
| "grad_norm": 0.21461325883865356, | |
| "learning_rate": 0.0001734744381744177, | |
| "loss": 0.1291, | |
| "mean_token_accuracy": 0.9674231857061386, | |
| "num_tokens": 1102009.0, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.2768, | |
| "grad_norm": 0.20319169759750366, | |
| "learning_rate": 0.0001731428155986299, | |
| "loss": 0.1132, | |
| "mean_token_accuracy": 0.9742031097412109, | |
| "num_tokens": 1108307.0, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.2784, | |
| "grad_norm": 0.20769742131233215, | |
| "learning_rate": 0.00017280949603993466, | |
| "loss": 0.123, | |
| "mean_token_accuracy": 0.9704341739416122, | |
| "num_tokens": 1114645.0, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.223323255777359, | |
| "learning_rate": 0.00017247448845639926, | |
| "loss": 0.1564, | |
| "mean_token_accuracy": 0.9620358049869537, | |
| "num_tokens": 1121208.0, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.2816, | |
| "grad_norm": 0.21843306720256805, | |
| "learning_rate": 0.00017213780185145722, | |
| "loss": 0.1172, | |
| "mean_token_accuracy": 0.9705088138580322, | |
| "num_tokens": 1127558.0, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.2832, | |
| "grad_norm": 0.21284207701683044, | |
| "learning_rate": 0.00017179944527366634, | |
| "loss": 0.118, | |
| "mean_token_accuracy": 0.9688057154417038, | |
| "num_tokens": 1133969.0, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.2848, | |
| "grad_norm": 0.20217035710811615, | |
| "learning_rate": 0.00017145942781646533, | |
| "loss": 0.1293, | |
| "mean_token_accuracy": 0.9691077172756195, | |
| "num_tokens": 1140358.0, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.2864, | |
| "grad_norm": 0.23235690593719482, | |
| "learning_rate": 0.00017111775861792977, | |
| "loss": 0.1349, | |
| "mean_token_accuracy": 0.96702641248703, | |
| "num_tokens": 1146844.0, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 0.21814842522144318, | |
| "learning_rate": 0.00017077444686052607, | |
| "loss": 0.1254, | |
| "mean_token_accuracy": 0.9675975143909454, | |
| "num_tokens": 1153280.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.2896, | |
| "grad_norm": 0.23143963515758514, | |
| "learning_rate": 0.00017042950177086503, | |
| "loss": 0.1286, | |
| "mean_token_accuracy": 0.9693543314933777, | |
| "num_tokens": 1159643.0, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.2912, | |
| "grad_norm": 0.20468924939632416, | |
| "learning_rate": 0.00017008293261945383, | |
| "loss": 0.1034, | |
| "mean_token_accuracy": 0.9741992652416229, | |
| "num_tokens": 1165941.0, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.2928, | |
| "grad_norm": 0.18912264704704285, | |
| "learning_rate": 0.00016973474872044672, | |
| "loss": 0.1132, | |
| "mean_token_accuracy": 0.97054822742939, | |
| "num_tokens": 1172330.0, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.2944, | |
| "grad_norm": 0.21152672171592712, | |
| "learning_rate": 0.0001693849594313948, | |
| "loss": 0.144, | |
| "mean_token_accuracy": 0.9662601351737976, | |
| "num_tokens": 1178699.0, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.296, | |
| "grad_norm": 0.19410429894924164, | |
| "learning_rate": 0.00016903357415299462, | |
| "loss": 0.1298, | |
| "mean_token_accuracy": 0.9687875956296921, | |
| "num_tokens": 1185016.0, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.2976, | |
| "grad_norm": 0.2133607715368271, | |
| "learning_rate": 0.00016868060232883532, | |
| "loss": 0.1224, | |
| "mean_token_accuracy": 0.966905266046524, | |
| "num_tokens": 1191360.0, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.2992, | |
| "grad_norm": 0.20015361905097961, | |
| "learning_rate": 0.000168326053445145, | |
| "loss": 0.1313, | |
| "mean_token_accuracy": 0.9676015228033066, | |
| "num_tokens": 1197745.0, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.3008, | |
| "grad_norm": 0.19227543473243713, | |
| "learning_rate": 0.0001679699370305358, | |
| "loss": 0.12, | |
| "mean_token_accuracy": 0.9702216982841492, | |
| "num_tokens": 1204142.0, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.3024, | |
| "grad_norm": 0.18203721940517426, | |
| "learning_rate": 0.00016761226265574766, | |
| "loss": 0.1117, | |
| "mean_token_accuracy": 0.9709526747465134, | |
| "num_tokens": 1210515.0, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "grad_norm": 0.19697356224060059, | |
| "learning_rate": 0.00016725303993339122, | |
| "loss": 0.1349, | |
| "mean_token_accuracy": 0.9693506807088852, | |
| "num_tokens": 1216943.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.3056, | |
| "grad_norm": 0.23757514357566833, | |
| "learning_rate": 0.00016689227851768938, | |
| "loss": 0.1427, | |
| "mean_token_accuracy": 0.9636243879795074, | |
| "num_tokens": 1223529.0, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.3072, | |
| "grad_norm": 0.1874391883611679, | |
| "learning_rate": 0.00016652998810421805, | |
| "loss": 0.1261, | |
| "mean_token_accuracy": 0.9681402146816254, | |
| "num_tokens": 1229981.0, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.3088, | |
| "grad_norm": 0.20513875782489777, | |
| "learning_rate": 0.00016616617842964523, | |
| "loss": 0.126, | |
| "mean_token_accuracy": 0.967596098780632, | |
| "num_tokens": 1236474.0, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.3104, | |
| "grad_norm": 0.1965560019016266, | |
| "learning_rate": 0.00016580085927146978, | |
| "loss": 0.125, | |
| "mean_token_accuracy": 0.9703619480133057, | |
| "num_tokens": 1242823.0, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.312, | |
| "grad_norm": 0.2100786417722702, | |
| "learning_rate": 0.00016543404044775815, | |
| "loss": 0.1319, | |
| "mean_token_accuracy": 0.9687833189964294, | |
| "num_tokens": 1249237.0, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.3136, | |
| "grad_norm": 0.19941875338554382, | |
| "learning_rate": 0.000165065731816881, | |
| "loss": 0.1193, | |
| "mean_token_accuracy": 0.9695306867361069, | |
| "num_tokens": 1255587.0, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.3152, | |
| "grad_norm": 0.20697136223316193, | |
| "learning_rate": 0.00016469594327724786, | |
| "loss": 0.1164, | |
| "mean_token_accuracy": 0.9706899374723434, | |
| "num_tokens": 1262047.0, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.3168, | |
| "grad_norm": 0.18834592401981354, | |
| "learning_rate": 0.00016432468476704134, | |
| "loss": 0.121, | |
| "mean_token_accuracy": 0.9711324870586395, | |
| "num_tokens": 1268447.0, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.3184, | |
| "grad_norm": 0.2166885882616043, | |
| "learning_rate": 0.00016395196626394995, | |
| "loss": 0.1438, | |
| "mean_token_accuracy": 0.9647317230701447, | |
| "num_tokens": 1274975.0, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.22583834826946259, | |
| "learning_rate": 0.00016357779778489995, | |
| "loss": 0.1254, | |
| "mean_token_accuracy": 0.9690139293670654, | |
| "num_tokens": 1281381.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.3216, | |
| "grad_norm": 0.25999006628990173, | |
| "learning_rate": 0.00016320218938578623, | |
| "loss": 0.1234, | |
| "mean_token_accuracy": 0.9680958092212677, | |
| "num_tokens": 1287814.0, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.3232, | |
| "grad_norm": 0.1921595185995102, | |
| "learning_rate": 0.00016282515116120187, | |
| "loss": 0.1191, | |
| "mean_token_accuracy": 0.9717408567667007, | |
| "num_tokens": 1294201.0, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.3248, | |
| "grad_norm": 0.2113265097141266, | |
| "learning_rate": 0.0001624466932441671, | |
| "loss": 0.12, | |
| "mean_token_accuracy": 0.9684627503156662, | |
| "num_tokens": 1300599.0, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.3264, | |
| "grad_norm": 0.20990341901779175, | |
| "learning_rate": 0.00016206682580585668, | |
| "loss": 0.1164, | |
| "mean_token_accuracy": 0.9701545983552933, | |
| "num_tokens": 1307046.0, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.328, | |
| "grad_norm": 0.21143266558647156, | |
| "learning_rate": 0.00016168555905532675, | |
| "loss": 0.1178, | |
| "mean_token_accuracy": 0.9703093618154526, | |
| "num_tokens": 1313462.0, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.3296, | |
| "grad_norm": 0.2028292715549469, | |
| "learning_rate": 0.00016130290323924045, | |
| "loss": 0.1214, | |
| "mean_token_accuracy": 0.9694353640079498, | |
| "num_tokens": 1319842.0, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.3312, | |
| "grad_norm": 0.19591575860977173, | |
| "learning_rate": 0.00016091886864159246, | |
| "loss": 0.1049, | |
| "mean_token_accuracy": 0.9734068661928177, | |
| "num_tokens": 1326116.0, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.3328, | |
| "grad_norm": 0.2032654732465744, | |
| "learning_rate": 0.0001605334655834326, | |
| "loss": 0.1233, | |
| "mean_token_accuracy": 0.9697255343198776, | |
| "num_tokens": 1332497.0, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.3344, | |
| "grad_norm": 0.20048914849758148, | |
| "learning_rate": 0.00016014670442258857, | |
| "loss": 0.124, | |
| "mean_token_accuracy": 0.9689445048570633, | |
| "num_tokens": 1338884.0, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "grad_norm": 0.21282394230365753, | |
| "learning_rate": 0.00015975859555338745, | |
| "loss": 0.1309, | |
| "mean_token_accuracy": 0.968353345990181, | |
| "num_tokens": 1345308.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.3376, | |
| "grad_norm": 0.2050577849149704, | |
| "learning_rate": 0.00015936914940637643, | |
| "loss": 0.1239, | |
| "mean_token_accuracy": 0.9703089147806168, | |
| "num_tokens": 1351607.0, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.3392, | |
| "grad_norm": 0.209857776761055, | |
| "learning_rate": 0.00015897837644804247, | |
| "loss": 0.1362, | |
| "mean_token_accuracy": 0.9671057909727097, | |
| "num_tokens": 1358154.0, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.3408, | |
| "grad_norm": 0.23910588026046753, | |
| "learning_rate": 0.00015858628718053093, | |
| "loss": 0.1215, | |
| "mean_token_accuracy": 0.968706801533699, | |
| "num_tokens": 1364491.0, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.3424, | |
| "grad_norm": 0.20729774236679077, | |
| "learning_rate": 0.0001581928921413635, | |
| "loss": 0.1187, | |
| "mean_token_accuracy": 0.9694890677928925, | |
| "num_tokens": 1370954.0, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.344, | |
| "grad_norm": 0.18330152332782745, | |
| "learning_rate": 0.00015779820190315477, | |
| "loss": 0.1176, | |
| "mean_token_accuracy": 0.9711231291294098, | |
| "num_tokens": 1377310.0, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.3456, | |
| "grad_norm": 0.19051209092140198, | |
| "learning_rate": 0.0001574022270733283, | |
| "loss": 0.1123, | |
| "mean_token_accuracy": 0.9726870059967041, | |
| "num_tokens": 1383611.0, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.3472, | |
| "grad_norm": 0.19454888999462128, | |
| "learning_rate": 0.00015700497829383145, | |
| "loss": 0.1195, | |
| "mean_token_accuracy": 0.9696877151727676, | |
| "num_tokens": 1389925.0, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.3488, | |
| "grad_norm": 0.19892489910125732, | |
| "learning_rate": 0.00015660646624084928, | |
| "loss": 0.1351, | |
| "mean_token_accuracy": 0.9690258502960205, | |
| "num_tokens": 1396297.0, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.3504, | |
| "grad_norm": 0.1927718073129654, | |
| "learning_rate": 0.00015620670162451786, | |
| "loss": 0.1161, | |
| "mean_token_accuracy": 0.9696942269802094, | |
| "num_tokens": 1402743.0, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 0.19402103126049042, | |
| "learning_rate": 0.0001558056951886362, | |
| "loss": 0.1249, | |
| "mean_token_accuracy": 0.9706161469221115, | |
| "num_tokens": 1409160.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.3536, | |
| "grad_norm": 0.2437286376953125, | |
| "learning_rate": 0.00015540345771037758, | |
| "loss": 0.1247, | |
| "mean_token_accuracy": 0.969157263636589, | |
| "num_tokens": 1415562.0, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.3552, | |
| "grad_norm": 0.22037462890148163, | |
| "learning_rate": 0.000155, | |
| "loss": 0.1309, | |
| "mean_token_accuracy": 0.967937096953392, | |
| "num_tokens": 1422055.0, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.3568, | |
| "grad_norm": 0.21038919687271118, | |
| "learning_rate": 0.00015459533290055556, | |
| "loss": 0.1451, | |
| "mean_token_accuracy": 0.9643835127353668, | |
| "num_tokens": 1428565.0, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.3584, | |
| "grad_norm": 0.2613218128681183, | |
| "learning_rate": 0.000154189467287599, | |
| "loss": 0.1218, | |
| "mean_token_accuracy": 0.9695648550987244, | |
| "num_tokens": 1435010.0, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.2185756415128708, | |
| "learning_rate": 0.00015378241406889558, | |
| "loss": 0.1257, | |
| "mean_token_accuracy": 0.9673685133457184, | |
| "num_tokens": 1441396.0, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.3616, | |
| "grad_norm": 0.18744446337223053, | |
| "learning_rate": 0.00015337418418412784, | |
| "loss": 0.115, | |
| "mean_token_accuracy": 0.9695663303136826, | |
| "num_tokens": 1447787.0, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.3632, | |
| "grad_norm": 0.20236626267433167, | |
| "learning_rate": 0.00015296478860460144, | |
| "loss": 0.1204, | |
| "mean_token_accuracy": 0.9680865108966827, | |
| "num_tokens": 1454208.0, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.3648, | |
| "grad_norm": 0.24336816370487213, | |
| "learning_rate": 0.00015255423833295063, | |
| "loss": 0.115, | |
| "mean_token_accuracy": 0.9713991284370422, | |
| "num_tokens": 1460589.0, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.3664, | |
| "grad_norm": 0.20105664432048798, | |
| "learning_rate": 0.0001521425444028423, | |
| "loss": 0.1056, | |
| "mean_token_accuracy": 0.9720177948474884, | |
| "num_tokens": 1466932.0, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "grad_norm": 0.20474457740783691, | |
| "learning_rate": 0.00015172971787867946, | |
| "loss": 0.1225, | |
| "mean_token_accuracy": 0.9683423638343811, | |
| "num_tokens": 1473391.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.3696, | |
| "grad_norm": 0.22190451622009277, | |
| "learning_rate": 0.00015131576985530406, | |
| "loss": 0.1393, | |
| "mean_token_accuracy": 0.9674231708049774, | |
| "num_tokens": 1479784.0, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.3712, | |
| "grad_norm": 0.20912696421146393, | |
| "learning_rate": 0.00015090071145769856, | |
| "loss": 0.1227, | |
| "mean_token_accuracy": 0.9675348401069641, | |
| "num_tokens": 1486192.0, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.3728, | |
| "grad_norm": 0.18880720436573029, | |
| "learning_rate": 0.00015048455384068725, | |
| "loss": 0.1055, | |
| "mean_token_accuracy": 0.9715090990066528, | |
| "num_tokens": 1492517.0, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.3744, | |
| "grad_norm": 0.2224239706993103, | |
| "learning_rate": 0.00015006730818863603, | |
| "loss": 0.1265, | |
| "mean_token_accuracy": 0.9706821441650391, | |
| "num_tokens": 1498857.0, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.376, | |
| "grad_norm": 0.22639583051204681, | |
| "learning_rate": 0.00014964898571515235, | |
| "loss": 0.1247, | |
| "mean_token_accuracy": 0.9685398042201996, | |
| "num_tokens": 1505221.0, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.3776, | |
| "grad_norm": 0.21840308606624603, | |
| "learning_rate": 0.0001492295976627834, | |
| "loss": 0.1297, | |
| "mean_token_accuracy": 0.9672094136476517, | |
| "num_tokens": 1511678.0, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.3792, | |
| "grad_norm": 0.21276673674583435, | |
| "learning_rate": 0.00014880915530271417, | |
| "loss": 0.1042, | |
| "mean_token_accuracy": 0.9731869846582413, | |
| "num_tokens": 1518029.0, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.3808, | |
| "grad_norm": 0.18728865683078766, | |
| "learning_rate": 0.0001483876699344646, | |
| "loss": 0.1089, | |
| "mean_token_accuracy": 0.9733744710683823, | |
| "num_tokens": 1524349.0, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.3824, | |
| "grad_norm": 0.22521647810935974, | |
| "learning_rate": 0.0001479651528855856, | |
| "loss": 0.1127, | |
| "mean_token_accuracy": 0.9685888588428497, | |
| "num_tokens": 1530848.0, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 0.23462074995040894, | |
| "learning_rate": 0.00014754161551135505, | |
| "loss": 0.1575, | |
| "mean_token_accuracy": 0.96436907351017, | |
| "num_tokens": 1537321.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.3856, | |
| "grad_norm": 0.20503735542297363, | |
| "learning_rate": 0.00014711706919447217, | |
| "loss": 0.1213, | |
| "mean_token_accuracy": 0.9680697917938232, | |
| "num_tokens": 1543692.0, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.3872, | |
| "grad_norm": 0.20140637457370758, | |
| "learning_rate": 0.000146691525344752, | |
| "loss": 0.1094, | |
| "mean_token_accuracy": 0.9715017676353455, | |
| "num_tokens": 1550094.0, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.3888, | |
| "grad_norm": 0.19523519277572632, | |
| "learning_rate": 0.00014626499539881853, | |
| "loss": 0.1133, | |
| "mean_token_accuracy": 0.972250685095787, | |
| "num_tokens": 1556558.0, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.3904, | |
| "grad_norm": 0.19837141036987305, | |
| "learning_rate": 0.00014583749081979738, | |
| "loss": 0.1245, | |
| "mean_token_accuracy": 0.9700156152248383, | |
| "num_tokens": 1563030.0, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.392, | |
| "grad_norm": 0.2056553065776825, | |
| "learning_rate": 0.00014540902309700772, | |
| "loss": 0.1199, | |
| "mean_token_accuracy": 0.9720380306243896, | |
| "num_tokens": 1569374.0, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.3936, | |
| "grad_norm": 0.21602964401245117, | |
| "learning_rate": 0.0001449796037456536, | |
| "loss": 0.1058, | |
| "mean_token_accuracy": 0.9709930568933487, | |
| "num_tokens": 1575735.0, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.3952, | |
| "grad_norm": 0.20429149270057678, | |
| "learning_rate": 0.00014454924430651423, | |
| "loss": 0.1268, | |
| "mean_token_accuracy": 0.9682994782924652, | |
| "num_tokens": 1582184.0, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.3968, | |
| "grad_norm": 0.1919829547405243, | |
| "learning_rate": 0.00014411795634563417, | |
| "loss": 0.1177, | |
| "mean_token_accuracy": 0.9713652729988098, | |
| "num_tokens": 1588624.0, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.3984, | |
| "grad_norm": 0.19841861724853516, | |
| "learning_rate": 0.00014368575145401208, | |
| "loss": 0.1209, | |
| "mean_token_accuracy": 0.9704104363918304, | |
| "num_tokens": 1595061.0, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.21545560657978058, | |
| "learning_rate": 0.00014325264124728966, | |
| "loss": 0.1162, | |
| "mean_token_accuracy": 0.9697667211294174, | |
| "num_tokens": 1601520.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.4016, | |
| "grad_norm": 0.20815452933311462, | |
| "learning_rate": 0.00014281863736543898, | |
| "loss": 0.1202, | |
| "mean_token_accuracy": 0.9678974449634552, | |
| "num_tokens": 1607981.0, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.4032, | |
| "grad_norm": 0.21342110633850098, | |
| "learning_rate": 0.0001423837514724501, | |
| "loss": 0.1229, | |
| "mean_token_accuracy": 0.9697300642728806, | |
| "num_tokens": 1614340.0, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.4048, | |
| "grad_norm": 0.20468273758888245, | |
| "learning_rate": 0.0001419479952560173, | |
| "loss": 0.1212, | |
| "mean_token_accuracy": 0.968998059630394, | |
| "num_tokens": 1620766.0, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.4064, | |
| "grad_norm": 0.21428808569908142, | |
| "learning_rate": 0.00014151138042722516, | |
| "loss": 0.1303, | |
| "mean_token_accuracy": 0.9673433899879456, | |
| "num_tokens": 1627173.0, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.408, | |
| "grad_norm": 0.18869782984256744, | |
| "learning_rate": 0.00014107391872023367, | |
| "loss": 0.1231, | |
| "mean_token_accuracy": 0.9711828827857971, | |
| "num_tokens": 1633594.0, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.4096, | |
| "grad_norm": 0.19752350449562073, | |
| "learning_rate": 0.00014063562189196296, | |
| "loss": 0.1224, | |
| "mean_token_accuracy": 0.9714065790176392, | |
| "num_tokens": 1639906.0, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.4112, | |
| "grad_norm": 0.21103455126285553, | |
| "learning_rate": 0.00014019650172177725, | |
| "loss": 0.1119, | |
| "mean_token_accuracy": 0.9714159965515137, | |
| "num_tokens": 1646318.0, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.4128, | |
| "grad_norm": 0.20981501042842865, | |
| "learning_rate": 0.00013975657001116844, | |
| "loss": 0.1295, | |
| "mean_token_accuracy": 0.9680128693580627, | |
| "num_tokens": 1652800.0, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.4144, | |
| "grad_norm": 0.21067166328430176, | |
| "learning_rate": 0.00013931583858343876, | |
| "loss": 0.1257, | |
| "mean_token_accuracy": 0.9674056023359299, | |
| "num_tokens": 1659234.0, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 0.214093416929245, | |
| "learning_rate": 0.00013887431928338308, | |
| "loss": 0.1212, | |
| "mean_token_accuracy": 0.9694658070802689, | |
| "num_tokens": 1665664.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.4176, | |
| "grad_norm": 0.20206278562545776, | |
| "learning_rate": 0.00013843202397697066, | |
| "loss": 0.1223, | |
| "mean_token_accuracy": 0.969369575381279, | |
| "num_tokens": 1672110.0, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.4192, | |
| "grad_norm": 0.2278628796339035, | |
| "learning_rate": 0.00013798896455102607, | |
| "loss": 0.1125, | |
| "mean_token_accuracy": 0.9708160609006882, | |
| "num_tokens": 1678493.0, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.4208, | |
| "grad_norm": 0.19317105412483215, | |
| "learning_rate": 0.00013754515291290989, | |
| "loss": 0.1161, | |
| "mean_token_accuracy": 0.9721627086400986, | |
| "num_tokens": 1684864.0, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.4224, | |
| "grad_norm": 0.2055988907814026, | |
| "learning_rate": 0.0001371006009901986, | |
| "loss": 0.1164, | |
| "mean_token_accuracy": 0.9715331196784973, | |
| "num_tokens": 1691311.0, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.424, | |
| "grad_norm": 0.2198437750339508, | |
| "learning_rate": 0.00013665532073036415, | |
| "loss": 0.1164, | |
| "mean_token_accuracy": 0.9691477119922638, | |
| "num_tokens": 1697650.0, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.4256, | |
| "grad_norm": 0.20301388204097748, | |
| "learning_rate": 0.0001362093241004527, | |
| "loss": 0.1221, | |
| "mean_token_accuracy": 0.9719629138708115, | |
| "num_tokens": 1704085.0, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.4272, | |
| "grad_norm": 0.22491414844989777, | |
| "learning_rate": 0.00013576262308676309, | |
| "loss": 0.1317, | |
| "mean_token_accuracy": 0.9683026969432831, | |
| "num_tokens": 1710578.0, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.4288, | |
| "grad_norm": 0.2095235288143158, | |
| "learning_rate": 0.00013531522969452466, | |
| "loss": 0.1217, | |
| "mean_token_accuracy": 0.9709521383047104, | |
| "num_tokens": 1716946.0, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.4304, | |
| "grad_norm": 0.20362724363803864, | |
| "learning_rate": 0.00013486715594757473, | |
| "loss": 0.1225, | |
| "mean_token_accuracy": 0.9720054864883423, | |
| "num_tokens": 1723391.0, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": 0.23961152136325836, | |
| "learning_rate": 0.0001344184138880353, | |
| "loss": 0.1224, | |
| "mean_token_accuracy": 0.9688589721918106, | |
| "num_tokens": 1729838.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.4336, | |
| "grad_norm": 0.20943373441696167, | |
| "learning_rate": 0.0001339690155759895, | |
| "loss": 0.1171, | |
| "mean_token_accuracy": 0.9713444262742996, | |
| "num_tokens": 1736188.0, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.4352, | |
| "grad_norm": 0.25839704275131226, | |
| "learning_rate": 0.00013351897308915746, | |
| "loss": 0.1371, | |
| "mean_token_accuracy": 0.9661355465650558, | |
| "num_tokens": 1742664.0, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.4368, | |
| "grad_norm": 0.20215949416160583, | |
| "learning_rate": 0.00013306829852257167, | |
| "loss": 0.1172, | |
| "mean_token_accuracy": 0.9709440320730209, | |
| "num_tokens": 1749078.0, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.4384, | |
| "grad_norm": 0.20678719878196716, | |
| "learning_rate": 0.000132617003988252, | |
| "loss": 0.1188, | |
| "mean_token_accuracy": 0.9701162576675415, | |
| "num_tokens": 1755549.0, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.21287719905376434, | |
| "learning_rate": 0.00013216510161488014, | |
| "loss": 0.12, | |
| "mean_token_accuracy": 0.9678113758563995, | |
| "num_tokens": 1761994.0, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.4416, | |
| "grad_norm": 0.21827968955039978, | |
| "learning_rate": 0.00013171260354747358, | |
| "loss": 0.1097, | |
| "mean_token_accuracy": 0.9712927043437958, | |
| "num_tokens": 1768425.0, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.4432, | |
| "grad_norm": 0.2035861313343048, | |
| "learning_rate": 0.00013125952194705944, | |
| "loss": 0.0983, | |
| "mean_token_accuracy": 0.9732427597045898, | |
| "num_tokens": 1774745.0, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.4448, | |
| "grad_norm": 0.19649146497249603, | |
| "learning_rate": 0.0001308058689903473, | |
| "loss": 0.1073, | |
| "mean_token_accuracy": 0.9727627336978912, | |
| "num_tokens": 1781179.0, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.4464, | |
| "grad_norm": 0.22073034942150116, | |
| "learning_rate": 0.00013035165686940212, | |
| "loss": 0.1267, | |
| "mean_token_accuracy": 0.9675616472959518, | |
| "num_tokens": 1787669.0, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 0.2733401656150818, | |
| "learning_rate": 0.00012989689779131677, | |
| "loss": 0.1332, | |
| "mean_token_accuracy": 0.9678397178649902, | |
| "num_tokens": 1794206.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.4496, | |
| "grad_norm": 0.2550334930419922, | |
| "learning_rate": 0.00012944160397788354, | |
| "loss": 0.1084, | |
| "mean_token_accuracy": 0.9733265787363052, | |
| "num_tokens": 1800552.0, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.4512, | |
| "grad_norm": 0.21747393906116486, | |
| "learning_rate": 0.000128985787665266, | |
| "loss": 0.1093, | |
| "mean_token_accuracy": 0.9712890237569809, | |
| "num_tokens": 1806942.0, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.4528, | |
| "grad_norm": 0.22573259472846985, | |
| "learning_rate": 0.00012852946110367006, | |
| "loss": 0.1282, | |
| "mean_token_accuracy": 0.9677459895610809, | |
| "num_tokens": 1813372.0, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.4544, | |
| "grad_norm": 0.20909075438976288, | |
| "learning_rate": 0.00012807263655701466, | |
| "loss": 0.1298, | |
| "mean_token_accuracy": 0.9668598026037216, | |
| "num_tokens": 1819984.0, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.456, | |
| "grad_norm": 0.19675710797309875, | |
| "learning_rate": 0.00012761532630260237, | |
| "loss": 0.1134, | |
| "mean_token_accuracy": 0.9703297019004822, | |
| "num_tokens": 1826393.0, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.4576, | |
| "grad_norm": 0.18973907828330994, | |
| "learning_rate": 0.0001271575426307892, | |
| "loss": 0.1089, | |
| "mean_token_accuracy": 0.9732929468154907, | |
| "num_tokens": 1832795.0, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.4592, | |
| "grad_norm": 0.21302960813045502, | |
| "learning_rate": 0.00012669929784465443, | |
| "loss": 0.1191, | |
| "mean_token_accuracy": 0.9711341261863708, | |
| "num_tokens": 1839119.0, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.4608, | |
| "grad_norm": 0.2029804140329361, | |
| "learning_rate": 0.00012624060425966985, | |
| "loss": 0.1157, | |
| "mean_token_accuracy": 0.9701730608940125, | |
| "num_tokens": 1845583.0, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.4624, | |
| "grad_norm": 0.19550254940986633, | |
| "learning_rate": 0.0001257814742033691, | |
| "loss": 0.1046, | |
| "mean_token_accuracy": 0.9721900969743729, | |
| "num_tokens": 1851989.0, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": 0.23089411854743958, | |
| "learning_rate": 0.00012532192001501587, | |
| "loss": 0.1084, | |
| "mean_token_accuracy": 0.9695534259080887, | |
| "num_tokens": 1858348.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.4656, | |
| "grad_norm": 0.23901835083961487, | |
| "learning_rate": 0.00012486195404527264, | |
| "loss": 0.1206, | |
| "mean_token_accuracy": 0.9688645452260971, | |
| "num_tokens": 1864712.0, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.4672, | |
| "grad_norm": 0.22964279353618622, | |
| "learning_rate": 0.00012440158865586868, | |
| "loss": 0.1143, | |
| "mean_token_accuracy": 0.9722412526607513, | |
| "num_tokens": 1871071.0, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.4688, | |
| "grad_norm": 0.20730289816856384, | |
| "learning_rate": 0.00012394083621926764, | |
| "loss": 0.112, | |
| "mean_token_accuracy": 0.9713632613420486, | |
| "num_tokens": 1877510.0, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.4704, | |
| "grad_norm": 0.2335960566997528, | |
| "learning_rate": 0.00012347970911833536, | |
| "loss": 0.1153, | |
| "mean_token_accuracy": 0.9718139171600342, | |
| "num_tokens": 1883885.0, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.472, | |
| "grad_norm": 0.21765975654125214, | |
| "learning_rate": 0.00012301821974600678, | |
| "loss": 0.1135, | |
| "mean_token_accuracy": 0.9720913171768188, | |
| "num_tokens": 1890304.0, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.4736, | |
| "grad_norm": 0.27499356865882874, | |
| "learning_rate": 0.00012255638050495308, | |
| "loss": 0.1188, | |
| "mean_token_accuracy": 0.9693499654531479, | |
| "num_tokens": 1896774.0, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.4752, | |
| "grad_norm": 0.20668935775756836, | |
| "learning_rate": 0.00012209420380724823, | |
| "loss": 0.1212, | |
| "mean_token_accuracy": 0.970684677362442, | |
| "num_tokens": 1903168.0, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.4768, | |
| "grad_norm": 0.2238469421863556, | |
| "learning_rate": 0.0001216317020740354, | |
| "loss": 0.1334, | |
| "mean_token_accuracy": 0.9677727222442627, | |
| "num_tokens": 1909636.0, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.4784, | |
| "grad_norm": 0.21707569062709808, | |
| "learning_rate": 0.00012116888773519334, | |
| "loss": 0.1193, | |
| "mean_token_accuracy": 0.9698205441236496, | |
| "num_tokens": 1916073.0, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.20731143653392792, | |
| "learning_rate": 0.00012070577322900203, | |
| "loss": 0.1285, | |
| "mean_token_accuracy": 0.9685927629470825, | |
| "num_tokens": 1922568.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.4816, | |
| "grad_norm": 0.22840896248817444, | |
| "learning_rate": 0.0001202423710018086, | |
| "loss": 0.1283, | |
| "mean_token_accuracy": 0.9675214439630508, | |
| "num_tokens": 1929042.0, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.4832, | |
| "grad_norm": 0.20883047580718994, | |
| "learning_rate": 0.00011977869350769271, | |
| "loss": 0.1293, | |
| "mean_token_accuracy": 0.9701283574104309, | |
| "num_tokens": 1935436.0, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.4848, | |
| "grad_norm": 0.23458795249462128, | |
| "learning_rate": 0.00011931475320813203, | |
| "loss": 0.1244, | |
| "mean_token_accuracy": 0.9691993445158005, | |
| "num_tokens": 1941696.0, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.4864, | |
| "grad_norm": 0.22013331949710846, | |
| "learning_rate": 0.00011885056257166714, | |
| "loss": 0.1221, | |
| "mean_token_accuracy": 0.9705575555562973, | |
| "num_tokens": 1948020.0, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.488, | |
| "grad_norm": 0.23760345578193665, | |
| "learning_rate": 0.00011838613407356647, | |
| "loss": 0.1177, | |
| "mean_token_accuracy": 0.9704036563634872, | |
| "num_tokens": 1954355.0, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.4896, | |
| "grad_norm": 0.21637286245822906, | |
| "learning_rate": 0.00011792148019549108, | |
| "loss": 0.119, | |
| "mean_token_accuracy": 0.9701642245054245, | |
| "num_tokens": 1960808.0, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.4912, | |
| "grad_norm": 0.1895596832036972, | |
| "learning_rate": 0.00011745661342515917, | |
| "loss": 0.1147, | |
| "mean_token_accuracy": 0.9714570045471191, | |
| "num_tokens": 1967292.0, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.4928, | |
| "grad_norm": 0.19408081471920013, | |
| "learning_rate": 0.00011699154625601059, | |
| "loss": 0.1269, | |
| "mean_token_accuracy": 0.967630922794342, | |
| "num_tokens": 1973631.0, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.4944, | |
| "grad_norm": 0.19877780973911285, | |
| "learning_rate": 0.00011652629118687081, | |
| "loss": 0.1138, | |
| "mean_token_accuracy": 0.9719504117965698, | |
| "num_tokens": 1980093.0, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": 0.1994163691997528, | |
| "learning_rate": 0.00011606086072161529, | |
| "loss": 0.1142, | |
| "mean_token_accuracy": 0.9715113788843155, | |
| "num_tokens": 1986565.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.4976, | |
| "grad_norm": 0.2109498828649521, | |
| "learning_rate": 0.00011559526736883326, | |
| "loss": 0.1271, | |
| "mean_token_accuracy": 0.9705955386161804, | |
| "num_tokens": 1992946.0, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.4992, | |
| "grad_norm": 0.26642775535583496, | |
| "learning_rate": 0.00011512952364149163, | |
| "loss": 0.1298, | |
| "mean_token_accuracy": 0.9658444672822952, | |
| "num_tokens": 1999549.0, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.5008, | |
| "grad_norm": 0.19585363566875458, | |
| "learning_rate": 0.00011466364205659868, | |
| "loss": 0.1092, | |
| "mean_token_accuracy": 0.9738690406084061, | |
| "num_tokens": 2005839.0, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.5024, | |
| "grad_norm": 0.21523182094097137, | |
| "learning_rate": 0.00011419763513486758, | |
| "loss": 0.1188, | |
| "mean_token_accuracy": 0.9695539325475693, | |
| "num_tokens": 2012224.0, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.504, | |
| "grad_norm": 0.20653575658798218, | |
| "learning_rate": 0.0001137315154003801, | |
| "loss": 0.109, | |
| "mean_token_accuracy": 0.9696223735809326, | |
| "num_tokens": 2018603.0, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.5056, | |
| "grad_norm": 0.22637856006622314, | |
| "learning_rate": 0.00011326529538024973, | |
| "loss": 0.1351, | |
| "mean_token_accuracy": 0.9652549624443054, | |
| "num_tokens": 2025037.0, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.5072, | |
| "grad_norm": 0.2478281557559967, | |
| "learning_rate": 0.00011279898760428534, | |
| "loss": 0.1157, | |
| "mean_token_accuracy": 0.9694862067699432, | |
| "num_tokens": 2031407.0, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.5088, | |
| "grad_norm": 0.21094129979610443, | |
| "learning_rate": 0.0001123326046046541, | |
| "loss": 0.1206, | |
| "mean_token_accuracy": 0.9682268351316452, | |
| "num_tokens": 2037825.0, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.5104, | |
| "grad_norm": 0.2196258008480072, | |
| "learning_rate": 0.00011186615891554498, | |
| "loss": 0.1245, | |
| "mean_token_accuracy": 0.9698169678449631, | |
| "num_tokens": 2044227.0, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 0.20592908561229706, | |
| "learning_rate": 0.00011139966307283161, | |
| "loss": 0.127, | |
| "mean_token_accuracy": 0.9721533507108688, | |
| "num_tokens": 2050654.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.5136, | |
| "grad_norm": 0.2053917944431305, | |
| "learning_rate": 0.00011093312961373561, | |
| "loss": 0.1138, | |
| "mean_token_accuracy": 0.9706659913063049, | |
| "num_tokens": 2057003.0, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.5152, | |
| "grad_norm": 0.22749063372612, | |
| "learning_rate": 0.00011046657107648959, | |
| "loss": 0.1363, | |
| "mean_token_accuracy": 0.9667005389928818, | |
| "num_tokens": 2063373.0, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.5168, | |
| "grad_norm": 0.2216198742389679, | |
| "learning_rate": 0.00011000000000000002, | |
| "loss": 0.1073, | |
| "mean_token_accuracy": 0.9733321070671082, | |
| "num_tokens": 2069825.0, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.5184, | |
| "grad_norm": 0.2153901904821396, | |
| "learning_rate": 0.00010953342892351046, | |
| "loss": 0.1092, | |
| "mean_token_accuracy": 0.9731239527463913, | |
| "num_tokens": 2076202.0, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.19987858831882477, | |
| "learning_rate": 0.0001090668703862644, | |
| "loss": 0.1037, | |
| "mean_token_accuracy": 0.9744304716587067, | |
| "num_tokens": 2082475.0, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.5216, | |
| "grad_norm": 0.2075393795967102, | |
| "learning_rate": 0.00010860033692716843, | |
| "loss": 0.1052, | |
| "mean_token_accuracy": 0.9723316729068756, | |
| "num_tokens": 2088866.0, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.5232, | |
| "grad_norm": 0.21882832050323486, | |
| "learning_rate": 0.0001081338410844551, | |
| "loss": 0.1273, | |
| "mean_token_accuracy": 0.9703366309404373, | |
| "num_tokens": 2095314.0, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.5248, | |
| "grad_norm": 0.19629748165607452, | |
| "learning_rate": 0.00010766739539534591, | |
| "loss": 0.1039, | |
| "mean_token_accuracy": 0.9732581228017807, | |
| "num_tokens": 2101677.0, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.5264, | |
| "grad_norm": 0.22761981189250946, | |
| "learning_rate": 0.0001072010123957147, | |
| "loss": 0.1065, | |
| "mean_token_accuracy": 0.9731667041778564, | |
| "num_tokens": 2108149.0, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "grad_norm": 0.23567941784858704, | |
| "learning_rate": 0.00010673470461975028, | |
| "loss": 0.0996, | |
| "mean_token_accuracy": 0.9738435000181198, | |
| "num_tokens": 2114510.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.5296, | |
| "grad_norm": 0.24491772055625916, | |
| "learning_rate": 0.00010626848459961993, | |
| "loss": 0.1168, | |
| "mean_token_accuracy": 0.969748243689537, | |
| "num_tokens": 2121038.0, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.5312, | |
| "grad_norm": 0.2350424975156784, | |
| "learning_rate": 0.00010580236486513244, | |
| "loss": 0.1168, | |
| "mean_token_accuracy": 0.9712713807821274, | |
| "num_tokens": 2127437.0, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.5328, | |
| "grad_norm": 0.22201870381832123, | |
| "learning_rate": 0.00010533635794340136, | |
| "loss": 0.1087, | |
| "mean_token_accuracy": 0.9725343734025955, | |
| "num_tokens": 2133827.0, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.5344, | |
| "grad_norm": 0.22964061796665192, | |
| "learning_rate": 0.00010487047635850837, | |
| "loss": 0.1202, | |
| "mean_token_accuracy": 0.9681902378797531, | |
| "num_tokens": 2140249.0, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.536, | |
| "grad_norm": 0.22518664598464966, | |
| "learning_rate": 0.00010440473263116676, | |
| "loss": 0.1126, | |
| "mean_token_accuracy": 0.9707356095314026, | |
| "num_tokens": 2146613.0, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.5376, | |
| "grad_norm": 0.20553700625896454, | |
| "learning_rate": 0.00010393913927838475, | |
| "loss": 0.1171, | |
| "mean_token_accuracy": 0.9688108265399933, | |
| "num_tokens": 2152972.0, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.5392, | |
| "grad_norm": 0.22132201492786407, | |
| "learning_rate": 0.00010347370881312926, | |
| "loss": 0.1195, | |
| "mean_token_accuracy": 0.9701298028230667, | |
| "num_tokens": 2159502.0, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.5408, | |
| "grad_norm": 0.21515999734401703, | |
| "learning_rate": 0.00010300845374398944, | |
| "loss": 0.1195, | |
| "mean_token_accuracy": 0.9729430824518204, | |
| "num_tokens": 2165850.0, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.5424, | |
| "grad_norm": 0.24679367244243622, | |
| "learning_rate": 0.00010254338657484086, | |
| "loss": 0.1069, | |
| "mean_token_accuracy": 0.970226839184761, | |
| "num_tokens": 2172244.0, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 0.23021413385868073, | |
| "learning_rate": 0.00010207851980450897, | |
| "loss": 0.1141, | |
| "mean_token_accuracy": 0.972098246216774, | |
| "num_tokens": 2178636.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.5456, | |
| "grad_norm": 0.2051527202129364, | |
| "learning_rate": 0.00010161386592643356, | |
| "loss": 0.0962, | |
| "mean_token_accuracy": 0.972399890422821, | |
| "num_tokens": 2185014.0, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.5472, | |
| "grad_norm": 0.212401881814003, | |
| "learning_rate": 0.0001011494374283329, | |
| "loss": 0.1305, | |
| "mean_token_accuracy": 0.9700024873018265, | |
| "num_tokens": 2191518.0, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.5488, | |
| "grad_norm": 0.263895183801651, | |
| "learning_rate": 0.00010068524679186799, | |
| "loss": 0.1303, | |
| "mean_token_accuracy": 0.9654296636581421, | |
| "num_tokens": 2198031.0, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.5504, | |
| "grad_norm": 0.21628491580486298, | |
| "learning_rate": 0.0001002213064923073, | |
| "loss": 0.1103, | |
| "mean_token_accuracy": 0.969383955001831, | |
| "num_tokens": 2204527.0, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.552, | |
| "grad_norm": 0.21267899870872498, | |
| "learning_rate": 9.975762899819144e-05, | |
| "loss": 0.105, | |
| "mean_token_accuracy": 0.9723921567201614, | |
| "num_tokens": 2210943.0, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.5536, | |
| "grad_norm": 0.22340619564056396, | |
| "learning_rate": 9.929422677099802e-05, | |
| "loss": 0.1278, | |
| "mean_token_accuracy": 0.9678090810775757, | |
| "num_tokens": 2217343.0, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.5552, | |
| "grad_norm": 0.21315431594848633, | |
| "learning_rate": 9.883111226480665e-05, | |
| "loss": 0.1118, | |
| "mean_token_accuracy": 0.9715865701436996, | |
| "num_tokens": 2223745.0, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.5568, | |
| "grad_norm": 0.19389605522155762, | |
| "learning_rate": 9.83682979259646e-05, | |
| "loss": 0.1094, | |
| "mean_token_accuracy": 0.9740034937858582, | |
| "num_tokens": 2230169.0, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.5584, | |
| "grad_norm": 0.18304763734340668, | |
| "learning_rate": 9.790579619275182e-05, | |
| "loss": 0.0996, | |
| "mean_token_accuracy": 0.9754536598920822, | |
| "num_tokens": 2236459.0, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.21581387519836426, | |
| "learning_rate": 9.744361949504694e-05, | |
| "loss": 0.1129, | |
| "mean_token_accuracy": 0.9693534970283508, | |
| "num_tokens": 2242893.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.5616, | |
| "grad_norm": 0.233414426445961, | |
| "learning_rate": 9.698178025399325e-05, | |
| "loss": 0.1126, | |
| "mean_token_accuracy": 0.9706227630376816, | |
| "num_tokens": 2249305.0, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.5632, | |
| "grad_norm": 0.2554890215396881, | |
| "learning_rate": 9.652029088166468e-05, | |
| "loss": 0.116, | |
| "mean_token_accuracy": 0.9684394896030426, | |
| "num_tokens": 2255818.0, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.5648, | |
| "grad_norm": 0.21633103489875793, | |
| "learning_rate": 9.605916378073238e-05, | |
| "loss": 0.1282, | |
| "mean_token_accuracy": 0.9679883718490601, | |
| "num_tokens": 2262328.0, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.5664, | |
| "grad_norm": 0.21234236657619476, | |
| "learning_rate": 9.559841134413137e-05, | |
| "loss": 0.1116, | |
| "mean_token_accuracy": 0.9704491943120956, | |
| "num_tokens": 2268735.0, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.568, | |
| "grad_norm": 0.20229332149028778, | |
| "learning_rate": 9.513804595472739e-05, | |
| "loss": 0.1064, | |
| "mean_token_accuracy": 0.9744052588939667, | |
| "num_tokens": 2275088.0, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.5696, | |
| "grad_norm": 0.2356174886226654, | |
| "learning_rate": 9.467807998498412e-05, | |
| "loss": 0.1324, | |
| "mean_token_accuracy": 0.9663089960813522, | |
| "num_tokens": 2281565.0, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.5712, | |
| "grad_norm": 0.23685023188591003, | |
| "learning_rate": 9.421852579663091e-05, | |
| "loss": 0.1273, | |
| "mean_token_accuracy": 0.9699599295854568, | |
| "num_tokens": 2288071.0, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.5728, | |
| "grad_norm": 0.21520361304283142, | |
| "learning_rate": 9.375939574033015e-05, | |
| "loss": 0.1075, | |
| "mean_token_accuracy": 0.9720684885978699, | |
| "num_tokens": 2294491.0, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.5744, | |
| "grad_norm": 0.23035873472690582, | |
| "learning_rate": 9.33007021553456e-05, | |
| "loss": 0.1153, | |
| "mean_token_accuracy": 0.970378652215004, | |
| "num_tokens": 2300951.0, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 0.254526287317276, | |
| "learning_rate": 9.284245736921084e-05, | |
| "loss": 0.1197, | |
| "mean_token_accuracy": 0.9711134880781174, | |
| "num_tokens": 2307360.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.5776, | |
| "grad_norm": 0.2077234834432602, | |
| "learning_rate": 9.238467369739765e-05, | |
| "loss": 0.1154, | |
| "mean_token_accuracy": 0.9706644117832184, | |
| "num_tokens": 2313749.0, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.5792, | |
| "grad_norm": 0.2178497016429901, | |
| "learning_rate": 9.192736344298536e-05, | |
| "loss": 0.1286, | |
| "mean_token_accuracy": 0.96926149725914, | |
| "num_tokens": 2320166.0, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.5808, | |
| "grad_norm": 0.21862684190273285, | |
| "learning_rate": 9.147053889632998e-05, | |
| "loss": 0.1216, | |
| "mean_token_accuracy": 0.9692836999893188, | |
| "num_tokens": 2326626.0, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.5824, | |
| "grad_norm": 0.20328934490680695, | |
| "learning_rate": 9.101421233473404e-05, | |
| "loss": 0.1084, | |
| "mean_token_accuracy": 0.9719917923212051, | |
| "num_tokens": 2333067.0, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.584, | |
| "grad_norm": 0.23007267713546753, | |
| "learning_rate": 9.05583960221165e-05, | |
| "loss": 0.113, | |
| "mean_token_accuracy": 0.9699399918317795, | |
| "num_tokens": 2339468.0, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.5856, | |
| "grad_norm": 0.23424407839775085, | |
| "learning_rate": 9.010310220868325e-05, | |
| "loss": 0.1126, | |
| "mean_token_accuracy": 0.9716910868883133, | |
| "num_tokens": 2345912.0, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.5872, | |
| "grad_norm": 0.23417724668979645, | |
| "learning_rate": 8.964834313059789e-05, | |
| "loss": 0.1185, | |
| "mean_token_accuracy": 0.9715997725725174, | |
| "num_tokens": 2352359.0, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.5888, | |
| "grad_norm": 0.23002183437347412, | |
| "learning_rate": 8.919413100965277e-05, | |
| "loss": 0.107, | |
| "mean_token_accuracy": 0.9706885367631912, | |
| "num_tokens": 2358754.0, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.5904, | |
| "grad_norm": 0.23188148438930511, | |
| "learning_rate": 8.874047805294058e-05, | |
| "loss": 0.1263, | |
| "mean_token_accuracy": 0.9674692898988724, | |
| "num_tokens": 2365383.0, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.592, | |
| "grad_norm": 0.21931526064872742, | |
| "learning_rate": 8.828739645252641e-05, | |
| "loss": 0.104, | |
| "mean_token_accuracy": 0.9727428257465363, | |
| "num_tokens": 2371871.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.5936, | |
| "grad_norm": 0.20687925815582275, | |
| "learning_rate": 8.783489838511989e-05, | |
| "loss": 0.133, | |
| "mean_token_accuracy": 0.9656850099563599, | |
| "num_tokens": 2378357.0, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.5952, | |
| "grad_norm": 0.2006218433380127, | |
| "learning_rate": 8.738299601174802e-05, | |
| "loss": 0.1056, | |
| "mean_token_accuracy": 0.972909688949585, | |
| "num_tokens": 2384758.0, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.5968, | |
| "grad_norm": 0.22540447115898132, | |
| "learning_rate": 8.693170147742835e-05, | |
| "loss": 0.1088, | |
| "mean_token_accuracy": 0.9730487316846848, | |
| "num_tokens": 2391150.0, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.5984, | |
| "grad_norm": 0.20852358639240265, | |
| "learning_rate": 8.648102691084256e-05, | |
| "loss": 0.0997, | |
| "mean_token_accuracy": 0.9727807939052582, | |
| "num_tokens": 2397555.0, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.21884866058826447, | |
| "learning_rate": 8.603098442401049e-05, | |
| "loss": 0.1273, | |
| "mean_token_accuracy": 0.9718177616596222, | |
| "num_tokens": 2403922.0, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.6016, | |
| "grad_norm": 0.24659469723701477, | |
| "learning_rate": 8.558158611196471e-05, | |
| "loss": 0.1116, | |
| "mean_token_accuracy": 0.9711674451828003, | |
| "num_tokens": 2410307.0, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.6032, | |
| "grad_norm": 0.21716761589050293, | |
| "learning_rate": 8.51328440524253e-05, | |
| "loss": 0.1164, | |
| "mean_token_accuracy": 0.9698936939239502, | |
| "num_tokens": 2416666.0, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.6048, | |
| "grad_norm": 0.21534349024295807, | |
| "learning_rate": 8.468477030547538e-05, | |
| "loss": 0.1208, | |
| "mean_token_accuracy": 0.9710631966590881, | |
| "num_tokens": 2423130.0, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.6064, | |
| "grad_norm": 0.2345367819070816, | |
| "learning_rate": 8.423737691323696e-05, | |
| "loss": 0.1012, | |
| "mean_token_accuracy": 0.971062421798706, | |
| "num_tokens": 2429522.0, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 0.2316274642944336, | |
| "learning_rate": 8.379067589954735e-05, | |
| "loss": 0.1261, | |
| "mean_token_accuracy": 0.9678024053573608, | |
| "num_tokens": 2435994.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.6096, | |
| "grad_norm": 0.206826850771904, | |
| "learning_rate": 8.334467926963585e-05, | |
| "loss": 0.1164, | |
| "mean_token_accuracy": 0.9720061868429184, | |
| "num_tokens": 2442395.0, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.6112, | |
| "grad_norm": 0.2266598641872406, | |
| "learning_rate": 8.289939900980142e-05, | |
| "loss": 0.0926, | |
| "mean_token_accuracy": 0.9766720235347748, | |
| "num_tokens": 2448703.0, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.6128, | |
| "grad_norm": 0.21031378209590912, | |
| "learning_rate": 8.245484708709015e-05, | |
| "loss": 0.1071, | |
| "mean_token_accuracy": 0.9754838347434998, | |
| "num_tokens": 2455042.0, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.6144, | |
| "grad_norm": 0.21050748229026794, | |
| "learning_rate": 8.201103544897395e-05, | |
| "loss": 0.1004, | |
| "mean_token_accuracy": 0.9737739711999893, | |
| "num_tokens": 2461534.0, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.616, | |
| "grad_norm": 0.23862861096858978, | |
| "learning_rate": 8.156797602302935e-05, | |
| "loss": 0.1038, | |
| "mean_token_accuracy": 0.9719929099082947, | |
| "num_tokens": 2467872.0, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.6176, | |
| "grad_norm": 0.22231410443782806, | |
| "learning_rate": 8.112568071661692e-05, | |
| "loss": 0.1111, | |
| "mean_token_accuracy": 0.9720417261123657, | |
| "num_tokens": 2474214.0, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.6192, | |
| "grad_norm": 0.23811957240104675, | |
| "learning_rate": 8.068416141656127e-05, | |
| "loss": 0.1291, | |
| "mean_token_accuracy": 0.9699693024158478, | |
| "num_tokens": 2480690.0, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.6208, | |
| "grad_norm": 0.24656380712985992, | |
| "learning_rate": 8.024342998883157e-05, | |
| "loss": 0.1167, | |
| "mean_token_accuracy": 0.9697620272636414, | |
| "num_tokens": 2487148.0, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.6224, | |
| "grad_norm": 0.21807914972305298, | |
| "learning_rate": 7.980349827822275e-05, | |
| "loss": 0.1065, | |
| "mean_token_accuracy": 0.9704707115888596, | |
| "num_tokens": 2493595.0, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.624, | |
| "grad_norm": 0.25028079748153687, | |
| "learning_rate": 7.936437810803708e-05, | |
| "loss": 0.1136, | |
| "mean_token_accuracy": 0.9703996926546097, | |
| "num_tokens": 2500053.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.6256, | |
| "grad_norm": 0.20675967633724213, | |
| "learning_rate": 7.892608127976633e-05, | |
| "loss": 0.1018, | |
| "mean_token_accuracy": 0.9746093451976776, | |
| "num_tokens": 2506395.0, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.6272, | |
| "grad_norm": 0.20429572463035583, | |
| "learning_rate": 7.848861957277485e-05, | |
| "loss": 0.0958, | |
| "mean_token_accuracy": 0.9738153666257858, | |
| "num_tokens": 2512713.0, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.6288, | |
| "grad_norm": 0.23090408742427826, | |
| "learning_rate": 7.805200474398273e-05, | |
| "loss": 0.1328, | |
| "mean_token_accuracy": 0.9680710434913635, | |
| "num_tokens": 2519113.0, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.6304, | |
| "grad_norm": 0.22681081295013428, | |
| "learning_rate": 7.761624852754992e-05, | |
| "loss": 0.1141, | |
| "mean_token_accuracy": 0.9692568331956863, | |
| "num_tokens": 2525564.0, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.632, | |
| "grad_norm": 0.24133142828941345, | |
| "learning_rate": 7.718136263456106e-05, | |
| "loss": 0.1115, | |
| "mean_token_accuracy": 0.970045804977417, | |
| "num_tokens": 2531985.0, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.6336, | |
| "grad_norm": 0.2129140943288803, | |
| "learning_rate": 7.67473587527104e-05, | |
| "loss": 0.1052, | |
| "mean_token_accuracy": 0.9710945636034012, | |
| "num_tokens": 2538468.0, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.6352, | |
| "grad_norm": 0.21738120913505554, | |
| "learning_rate": 7.631424854598792e-05, | |
| "loss": 0.1044, | |
| "mean_token_accuracy": 0.9731893539428711, | |
| "num_tokens": 2544787.0, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.6368, | |
| "grad_norm": 0.22617043554782867, | |
| "learning_rate": 7.588204365436589e-05, | |
| "loss": 0.1245, | |
| "mean_token_accuracy": 0.9708817601203918, | |
| "num_tokens": 2551278.0, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.6384, | |
| "grad_norm": 0.2163003385066986, | |
| "learning_rate": 7.545075569348579e-05, | |
| "loss": 0.112, | |
| "mean_token_accuracy": 0.9716361314058304, | |
| "num_tokens": 2557701.0, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.19317835569381714, | |
| "learning_rate": 7.502039625434644e-05, | |
| "loss": 0.1172, | |
| "mean_token_accuracy": 0.96929632127285, | |
| "num_tokens": 2564148.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.6416, | |
| "grad_norm": 0.23875229060649872, | |
| "learning_rate": 7.45909769029923e-05, | |
| "loss": 0.1428, | |
| "mean_token_accuracy": 0.9645784199237823, | |
| "num_tokens": 2570632.0, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.6432, | |
| "grad_norm": 0.19451692700386047, | |
| "learning_rate": 7.41625091802027e-05, | |
| "loss": 0.106, | |
| "mean_token_accuracy": 0.9721459746360779, | |
| "num_tokens": 2577037.0, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.6448, | |
| "grad_norm": 0.1956557333469391, | |
| "learning_rate": 7.373500460118148e-05, | |
| "loss": 0.1234, | |
| "mean_token_accuracy": 0.9685563743114471, | |
| "num_tokens": 2583442.0, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.6464, | |
| "grad_norm": 0.24173645675182343, | |
| "learning_rate": 7.330847465524802e-05, | |
| "loss": 0.1103, | |
| "mean_token_accuracy": 0.9694543927907944, | |
| "num_tokens": 2589936.0, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.648, | |
| "grad_norm": 0.2031894326210022, | |
| "learning_rate": 7.288293080552785e-05, | |
| "loss": 0.1048, | |
| "mean_token_accuracy": 0.9738620072603226, | |
| "num_tokens": 2596292.0, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.6496, | |
| "grad_norm": 0.21347053349018097, | |
| "learning_rate": 7.245838448864497e-05, | |
| "loss": 0.1222, | |
| "mean_token_accuracy": 0.9674569964408875, | |
| "num_tokens": 2602763.0, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.6512, | |
| "grad_norm": 0.21780243515968323, | |
| "learning_rate": 7.203484711441442e-05, | |
| "loss": 0.1118, | |
| "mean_token_accuracy": 0.9686301946640015, | |
| "num_tokens": 2609186.0, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.6528, | |
| "grad_norm": 0.21225087344646454, | |
| "learning_rate": 7.161233006553545e-05, | |
| "loss": 0.0993, | |
| "mean_token_accuracy": 0.9753576815128326, | |
| "num_tokens": 2615569.0, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.6544, | |
| "grad_norm": 0.1989884376525879, | |
| "learning_rate": 7.119084469728578e-05, | |
| "loss": 0.1249, | |
| "mean_token_accuracy": 0.971043273806572, | |
| "num_tokens": 2621962.0, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.656, | |
| "grad_norm": 0.19863280653953552, | |
| "learning_rate": 7.077040233721662e-05, | |
| "loss": 0.098, | |
| "mean_token_accuracy": 0.9739304184913635, | |
| "num_tokens": 2628383.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.6576, | |
| "grad_norm": 0.21673686802387238, | |
| "learning_rate": 7.035101428484767e-05, | |
| "loss": 0.1238, | |
| "mean_token_accuracy": 0.9696787446737289, | |
| "num_tokens": 2634730.0, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.6592, | |
| "grad_norm": 0.25210317969322205, | |
| "learning_rate": 6.993269181136397e-05, | |
| "loss": 0.1053, | |
| "mean_token_accuracy": 0.9704767763614655, | |
| "num_tokens": 2641112.0, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.6608, | |
| "grad_norm": 0.1998644769191742, | |
| "learning_rate": 6.951544615931278e-05, | |
| "loss": 0.1097, | |
| "mean_token_accuracy": 0.9719544500112534, | |
| "num_tokens": 2647550.0, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.6624, | |
| "grad_norm": 0.2235717922449112, | |
| "learning_rate": 6.909928854230146e-05, | |
| "loss": 0.1112, | |
| "mean_token_accuracy": 0.9722190648317337, | |
| "num_tokens": 2654003.0, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.664, | |
| "grad_norm": 0.21535825729370117, | |
| "learning_rate": 6.868423014469597e-05, | |
| "loss": 0.0979, | |
| "mean_token_accuracy": 0.9724543541669846, | |
| "num_tokens": 2660373.0, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.6656, | |
| "grad_norm": 0.21652565896511078, | |
| "learning_rate": 6.827028212132052e-05, | |
| "loss": 0.1013, | |
| "mean_token_accuracy": 0.9751291424036026, | |
| "num_tokens": 2666774.0, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.6672, | |
| "grad_norm": 0.23333965241909027, | |
| "learning_rate": 6.785745559715774e-05, | |
| "loss": 0.1117, | |
| "mean_token_accuracy": 0.9693506360054016, | |
| "num_tokens": 2673265.0, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.6688, | |
| "grad_norm": 0.21983280777931213, | |
| "learning_rate": 6.744576166704941e-05, | |
| "loss": 0.1142, | |
| "mean_token_accuracy": 0.9687365144491196, | |
| "num_tokens": 2679706.0, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.6704, | |
| "grad_norm": 0.23087739944458008, | |
| "learning_rate": 6.703521139539855e-05, | |
| "loss": 0.0983, | |
| "mean_token_accuracy": 0.9735719114542007, | |
| "num_tokens": 2686189.0, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 0.22379006445407867, | |
| "learning_rate": 6.66258158158722e-05, | |
| "loss": 0.1022, | |
| "mean_token_accuracy": 0.9740599393844604, | |
| "num_tokens": 2692534.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.6736, | |
| "grad_norm": 0.24048611521720886, | |
| "learning_rate": 6.621758593110444e-05, | |
| "loss": 0.1191, | |
| "mean_token_accuracy": 0.9698329567909241, | |
| "num_tokens": 2698978.0, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 0.6752, | |
| "grad_norm": 0.21570561826229095, | |
| "learning_rate": 6.581053271240101e-05, | |
| "loss": 0.0975, | |
| "mean_token_accuracy": 0.9741147607564926, | |
| "num_tokens": 2705293.0, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.6768, | |
| "grad_norm": 0.24703587591648102, | |
| "learning_rate": 6.540466709944446e-05, | |
| "loss": 0.1124, | |
| "mean_token_accuracy": 0.9693271368741989, | |
| "num_tokens": 2711747.0, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 0.6784, | |
| "grad_norm": 0.21890997886657715, | |
| "learning_rate": 6.500000000000002e-05, | |
| "loss": 0.1059, | |
| "mean_token_accuracy": 0.9740868806838989, | |
| "num_tokens": 2718231.0, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.2166839987039566, | |
| "learning_rate": 6.459654228962244e-05, | |
| "loss": 0.0947, | |
| "mean_token_accuracy": 0.9743200689554214, | |
| "num_tokens": 2724582.0, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.6816, | |
| "grad_norm": 0.21312065422534943, | |
| "learning_rate": 6.419430481136381e-05, | |
| "loss": 0.1041, | |
| "mean_token_accuracy": 0.9726835191249847, | |
| "num_tokens": 2730899.0, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.6832, | |
| "grad_norm": 0.2374408096075058, | |
| "learning_rate": 6.379329837548216e-05, | |
| "loss": 0.1152, | |
| "mean_token_accuracy": 0.9708255529403687, | |
| "num_tokens": 2737258.0, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 0.6848, | |
| "grad_norm": 0.21979407966136932, | |
| "learning_rate": 6.339353375915071e-05, | |
| "loss": 0.0988, | |
| "mean_token_accuracy": 0.9739894717931747, | |
| "num_tokens": 2743597.0, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 0.6864, | |
| "grad_norm": 0.21338622272014618, | |
| "learning_rate": 6.29950217061686e-05, | |
| "loss": 0.1172, | |
| "mean_token_accuracy": 0.9711753129959106, | |
| "num_tokens": 2750091.0, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 0.688, | |
| "grad_norm": 0.22662830352783203, | |
| "learning_rate": 6.259777292667172e-05, | |
| "loss": 0.125, | |
| "mean_token_accuracy": 0.9703481942415237, | |
| "num_tokens": 2756511.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.6896, | |
| "grad_norm": 0.22837099432945251, | |
| "learning_rate": 6.220179809684524e-05, | |
| "loss": 0.1162, | |
| "mean_token_accuracy": 0.9711373746395111, | |
| "num_tokens": 2762873.0, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 0.6912, | |
| "grad_norm": 0.2118934988975525, | |
| "learning_rate": 6.180710785863655e-05, | |
| "loss": 0.1218, | |
| "mean_token_accuracy": 0.9693483412265778, | |
| "num_tokens": 2769330.0, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.6928, | |
| "grad_norm": 0.2119901031255722, | |
| "learning_rate": 6.141371281946908e-05, | |
| "loss": 0.1202, | |
| "mean_token_accuracy": 0.9698498249053955, | |
| "num_tokens": 2775759.0, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 0.6944, | |
| "grad_norm": 0.23500986397266388, | |
| "learning_rate": 6.102162355195753e-05, | |
| "loss": 0.1209, | |
| "mean_token_accuracy": 0.969989612698555, | |
| "num_tokens": 2782223.0, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.696, | |
| "grad_norm": 0.23414058983325958, | |
| "learning_rate": 6.063085059362358e-05, | |
| "loss": 0.1029, | |
| "mean_token_accuracy": 0.9707781374454498, | |
| "num_tokens": 2788606.0, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.6976, | |
| "grad_norm": 0.2435092329978943, | |
| "learning_rate": 6.024140444661258e-05, | |
| "loss": 0.1256, | |
| "mean_token_accuracy": 0.9695480614900589, | |
| "num_tokens": 2794987.0, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 0.6992, | |
| "grad_norm": 0.2444123774766922, | |
| "learning_rate": 5.985329557741146e-05, | |
| "loss": 0.1305, | |
| "mean_token_accuracy": 0.9657484143972397, | |
| "num_tokens": 2801426.0, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 0.7008, | |
| "grad_norm": 0.22109363973140717, | |
| "learning_rate": 5.946653441656741e-05, | |
| "loss": 0.1144, | |
| "mean_token_accuracy": 0.972969263792038, | |
| "num_tokens": 2807838.0, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.7024, | |
| "grad_norm": 0.2172664850950241, | |
| "learning_rate": 5.908113135840758e-05, | |
| "loss": 0.091, | |
| "mean_token_accuracy": 0.9764287173748016, | |
| "num_tokens": 2814221.0, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 0.18156668543815613, | |
| "learning_rate": 5.8697096760759565e-05, | |
| "loss": 0.1014, | |
| "mean_token_accuracy": 0.9724221527576447, | |
| "num_tokens": 2820687.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.7056, | |
| "grad_norm": 0.194484680891037, | |
| "learning_rate": 5.831444094467326e-05, | |
| "loss": 0.1185, | |
| "mean_token_accuracy": 0.9724224805831909, | |
| "num_tokens": 2826999.0, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.7072, | |
| "grad_norm": 0.24278900027275085, | |
| "learning_rate": 5.793317419414337e-05, | |
| "loss": 0.111, | |
| "mean_token_accuracy": 0.9707483798265457, | |
| "num_tokens": 2833441.0, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 0.7088, | |
| "grad_norm": 0.22283653914928436, | |
| "learning_rate": 5.755330675583292e-05, | |
| "loss": 0.1129, | |
| "mean_token_accuracy": 0.9707882255315781, | |
| "num_tokens": 2839959.0, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 0.7104, | |
| "grad_norm": 0.2077089548110962, | |
| "learning_rate": 5.717484883879811e-05, | |
| "loss": 0.1112, | |
| "mean_token_accuracy": 0.9716843664646149, | |
| "num_tokens": 2846334.0, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.712, | |
| "grad_norm": 0.21356520056724548, | |
| "learning_rate": 5.6797810614213806e-05, | |
| "loss": 0.1045, | |
| "mean_token_accuracy": 0.9725764244794846, | |
| "num_tokens": 2852738.0, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.7136, | |
| "grad_norm": 0.21537309885025024, | |
| "learning_rate": 5.642220221510008e-05, | |
| "loss": 0.1217, | |
| "mean_token_accuracy": 0.9670560657978058, | |
| "num_tokens": 2859278.0, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 0.7152, | |
| "grad_norm": 0.2129800170660019, | |
| "learning_rate": 5.604803373605006e-05, | |
| "loss": 0.1003, | |
| "mean_token_accuracy": 0.9748883098363876, | |
| "num_tokens": 2865625.0, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 0.7168, | |
| "grad_norm": 0.28883570432662964, | |
| "learning_rate": 5.567531523295868e-05, | |
| "loss": 0.1041, | |
| "mean_token_accuracy": 0.9712024629116058, | |
| "num_tokens": 2872138.0, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.7184, | |
| "grad_norm": 0.21833978593349457, | |
| "learning_rate": 5.5304056722752185e-05, | |
| "loss": 0.1052, | |
| "mean_token_accuracy": 0.9744507074356079, | |
| "num_tokens": 2878533.0, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.2266368716955185, | |
| "learning_rate": 5.493426818311902e-05, | |
| "loss": 0.1081, | |
| "mean_token_accuracy": 0.9740916341543198, | |
| "num_tokens": 2884872.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.7216, | |
| "grad_norm": 0.21726545691490173, | |
| "learning_rate": 5.456595955224184e-05, | |
| "loss": 0.1073, | |
| "mean_token_accuracy": 0.9714181870222092, | |
| "num_tokens": 2891238.0, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 0.7232, | |
| "grad_norm": 0.22424107789993286, | |
| "learning_rate": 5.419914072853025e-05, | |
| "loss": 0.1058, | |
| "mean_token_accuracy": 0.97254478931427, | |
| "num_tokens": 2897621.0, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 0.7248, | |
| "grad_norm": 0.26466894149780273, | |
| "learning_rate": 5.383382157035477e-05, | |
| "loss": 0.1087, | |
| "mean_token_accuracy": 0.9701904803514481, | |
| "num_tokens": 2904022.0, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 0.7264, | |
| "grad_norm": 0.2095046192407608, | |
| "learning_rate": 5.347001189578198e-05, | |
| "loss": 0.1046, | |
| "mean_token_accuracy": 0.9721367359161377, | |
| "num_tokens": 2910420.0, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 0.728, | |
| "grad_norm": 0.24525317549705505, | |
| "learning_rate": 5.3107721482310634e-05, | |
| "loss": 0.1251, | |
| "mean_token_accuracy": 0.9690483659505844, | |
| "num_tokens": 2916924.0, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.7296, | |
| "grad_norm": 0.21807846426963806, | |
| "learning_rate": 5.27469600666088e-05, | |
| "loss": 0.1029, | |
| "mean_token_accuracy": 0.9724613279104233, | |
| "num_tokens": 2923286.0, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 0.7312, | |
| "grad_norm": 0.25389304757118225, | |
| "learning_rate": 5.2387737344252365e-05, | |
| "loss": 0.0969, | |
| "mean_token_accuracy": 0.9755337238311768, | |
| "num_tokens": 2929669.0, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 0.7328, | |
| "grad_norm": 0.2176523059606552, | |
| "learning_rate": 5.203006296946421e-05, | |
| "loss": 0.105, | |
| "mean_token_accuracy": 0.9717642962932587, | |
| "num_tokens": 2936074.0, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 0.7344, | |
| "grad_norm": 0.21290598809719086, | |
| "learning_rate": 5.1673946554855e-05, | |
| "loss": 0.104, | |
| "mean_token_accuracy": 0.9721969962120056, | |
| "num_tokens": 2942459.0, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 0.21251799166202545, | |
| "learning_rate": 5.131939767116472e-05, | |
| "loss": 0.1097, | |
| "mean_token_accuracy": 0.9719259291887283, | |
| "num_tokens": 2948856.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.7376, | |
| "grad_norm": 0.2152663916349411, | |
| "learning_rate": 5.096642584700542e-05, | |
| "loss": 0.1232, | |
| "mean_token_accuracy": 0.9679667055606842, | |
| "num_tokens": 2955303.0, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 0.7392, | |
| "grad_norm": 0.2524069845676422, | |
| "learning_rate": 5.061504056860522e-05, | |
| "loss": 0.0988, | |
| "mean_token_accuracy": 0.9723905920982361, | |
| "num_tokens": 2961693.0, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 0.7408, | |
| "grad_norm": 0.2203751653432846, | |
| "learning_rate": 5.0265251279553304e-05, | |
| "loss": 0.1052, | |
| "mean_token_accuracy": 0.9739673286676407, | |
| "num_tokens": 2968105.0, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 0.7424, | |
| "grad_norm": 0.26634886860847473, | |
| "learning_rate": 4.991706738054618e-05, | |
| "loss": 0.1157, | |
| "mean_token_accuracy": 0.9690550863742828, | |
| "num_tokens": 2974594.0, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 0.744, | |
| "grad_norm": 0.2508355975151062, | |
| "learning_rate": 4.9570498229134986e-05, | |
| "loss": 0.1286, | |
| "mean_token_accuracy": 0.9678440541028976, | |
| "num_tokens": 2981071.0, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.7456, | |
| "grad_norm": 0.2097889631986618, | |
| "learning_rate": 4.922555313947397e-05, | |
| "loss": 0.0945, | |
| "mean_token_accuracy": 0.9744881242513657, | |
| "num_tokens": 2987396.0, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 0.7472, | |
| "grad_norm": 0.2008487433195114, | |
| "learning_rate": 4.888224138207029e-05, | |
| "loss": 0.1004, | |
| "mean_token_accuracy": 0.9743931442499161, | |
| "num_tokens": 2993764.0, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 0.7488, | |
| "grad_norm": 0.23793430626392365, | |
| "learning_rate": 4.8540572183534676e-05, | |
| "loss": 0.1172, | |
| "mean_token_accuracy": 0.9689698666334152, | |
| "num_tokens": 3000193.0, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 0.7504, | |
| "grad_norm": 0.25740739703178406, | |
| "learning_rate": 4.8200554726333695e-05, | |
| "loss": 0.1179, | |
| "mean_token_accuracy": 0.9703273624181747, | |
| "num_tokens": 3006678.0, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": 0.2496376484632492, | |
| "learning_rate": 4.7862198148542804e-05, | |
| "loss": 0.1236, | |
| "mean_token_accuracy": 0.9692432284355164, | |
| "num_tokens": 3013217.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.7536, | |
| "grad_norm": 0.20700585842132568, | |
| "learning_rate": 4.752551154360077e-05, | |
| "loss": 0.1064, | |
| "mean_token_accuracy": 0.9738883674144745, | |
| "num_tokens": 3019542.0, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 0.7552, | |
| "grad_norm": 0.22873146831989288, | |
| "learning_rate": 4.719050396006535e-05, | |
| "loss": 0.1106, | |
| "mean_token_accuracy": 0.9706408679485321, | |
| "num_tokens": 3025955.0, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 0.7568, | |
| "grad_norm": 0.2061772644519806, | |
| "learning_rate": 4.685718440137011e-05, | |
| "loss": 0.1053, | |
| "mean_token_accuracy": 0.9729923903942108, | |
| "num_tokens": 3032365.0, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 0.7584, | |
| "grad_norm": 0.20927031338214874, | |
| "learning_rate": 4.652556182558237e-05, | |
| "loss": 0.0979, | |
| "mean_token_accuracy": 0.9735575169324875, | |
| "num_tokens": 3038734.0, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.20960409939289093, | |
| "learning_rate": 4.619564514516245e-05, | |
| "loss": 0.1047, | |
| "mean_token_accuracy": 0.9743406474590302, | |
| "num_tokens": 3045101.0, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.7616, | |
| "grad_norm": 0.20999054610729218, | |
| "learning_rate": 4.5867443226724386e-05, | |
| "loss": 0.1237, | |
| "mean_token_accuracy": 0.9704978615045547, | |
| "num_tokens": 3051544.0, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 0.7632, | |
| "grad_norm": 0.2393818497657776, | |
| "learning_rate": 4.554096489079727e-05, | |
| "loss": 0.1113, | |
| "mean_token_accuracy": 0.9697568118572235, | |
| "num_tokens": 3057939.0, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 0.7648, | |
| "grad_norm": 0.242337167263031, | |
| "learning_rate": 4.5216218911588396e-05, | |
| "loss": 0.083, | |
| "mean_token_accuracy": 0.9772262275218964, | |
| "num_tokens": 3064259.0, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 0.7664, | |
| "grad_norm": 0.21983551979064941, | |
| "learning_rate": 4.489321401674753e-05, | |
| "loss": 0.1141, | |
| "mean_token_accuracy": 0.968664214015007, | |
| "num_tokens": 3070788.0, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 0.2546117901802063, | |
| "learning_rate": 4.457195888713219e-05, | |
| "loss": 0.13, | |
| "mean_token_accuracy": 0.9657116234302521, | |
| "num_tokens": 3077278.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.7696, | |
| "grad_norm": 0.23078176379203796, | |
| "learning_rate": 4.425246215657436e-05, | |
| "loss": 0.1061, | |
| "mean_token_accuracy": 0.9717554599046707, | |
| "num_tokens": 3083789.0, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 0.7712, | |
| "grad_norm": 0.20604689419269562, | |
| "learning_rate": 4.3934732411648585e-05, | |
| "loss": 0.1025, | |
| "mean_token_accuracy": 0.9724752753973007, | |
| "num_tokens": 3090157.0, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 0.7728, | |
| "grad_norm": 0.2543022334575653, | |
| "learning_rate": 4.36187781914411e-05, | |
| "loss": 0.1126, | |
| "mean_token_accuracy": 0.9690060913562775, | |
| "num_tokens": 3096538.0, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 0.7744, | |
| "grad_norm": 0.23018792271614075, | |
| "learning_rate": 4.33046079873203e-05, | |
| "loss": 0.1252, | |
| "mean_token_accuracy": 0.9674163907766342, | |
| "num_tokens": 3103061.0, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 0.776, | |
| "grad_norm": 0.19241195917129517, | |
| "learning_rate": 4.2992230242708645e-05, | |
| "loss": 0.0953, | |
| "mean_token_accuracy": 0.9743133783340454, | |
| "num_tokens": 3109342.0, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.7776, | |
| "grad_norm": 0.20888718962669373, | |
| "learning_rate": 4.268165335285566e-05, | |
| "loss": 0.1036, | |
| "mean_token_accuracy": 0.9740147739648819, | |
| "num_tokens": 3115697.0, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 0.7792, | |
| "grad_norm": 0.23827168345451355, | |
| "learning_rate": 4.237288566461235e-05, | |
| "loss": 0.1148, | |
| "mean_token_accuracy": 0.972451776266098, | |
| "num_tokens": 3122099.0, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 0.7808, | |
| "grad_norm": 0.22637143731117249, | |
| "learning_rate": 4.2065935476206885e-05, | |
| "loss": 0.1124, | |
| "mean_token_accuracy": 0.9709622263908386, | |
| "num_tokens": 3128517.0, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 0.7824, | |
| "grad_norm": 0.19767995178699493, | |
| "learning_rate": 4.1760811037021484e-05, | |
| "loss": 0.1017, | |
| "mean_token_accuracy": 0.971914678812027, | |
| "num_tokens": 3134870.0, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": 0.2062818557024002, | |
| "learning_rate": 4.145752054737087e-05, | |
| "loss": 0.1186, | |
| "mean_token_accuracy": 0.9698653817176819, | |
| "num_tokens": 3141313.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.7856, | |
| "grad_norm": 0.20594243705272675, | |
| "learning_rate": 4.115607215828172e-05, | |
| "loss": 0.1061, | |
| "mean_token_accuracy": 0.973545104265213, | |
| "num_tokens": 3147678.0, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 0.7872, | |
| "grad_norm": 0.23413920402526855, | |
| "learning_rate": 4.085647397127376e-05, | |
| "loss": 0.113, | |
| "mean_token_accuracy": 0.9707321226596832, | |
| "num_tokens": 3154080.0, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 0.7888, | |
| "grad_norm": 0.2359134703874588, | |
| "learning_rate": 4.055873403814191e-05, | |
| "loss": 0.1359, | |
| "mean_token_accuracy": 0.9669735133647919, | |
| "num_tokens": 3160519.0, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 0.7904, | |
| "grad_norm": 0.19444267451763153, | |
| "learning_rate": 4.0262860360739915e-05, | |
| "loss": 0.1006, | |
| "mean_token_accuracy": 0.9741977751255035, | |
| "num_tokens": 3166881.0, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 0.792, | |
| "grad_norm": 0.24528303742408752, | |
| "learning_rate": 3.996886089076541e-05, | |
| "loss": 0.1091, | |
| "mean_token_accuracy": 0.971895158290863, | |
| "num_tokens": 3173273.0, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.7936, | |
| "grad_norm": 0.2260335236787796, | |
| "learning_rate": 3.967674352954599e-05, | |
| "loss": 0.108, | |
| "mean_token_accuracy": 0.9725111573934555, | |
| "num_tokens": 3179726.0, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 0.7952, | |
| "grad_norm": 0.22950369119644165, | |
| "learning_rate": 3.938651612782707e-05, | |
| "loss": 0.105, | |
| "mean_token_accuracy": 0.9726218730211258, | |
| "num_tokens": 3186131.0, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 0.7968, | |
| "grad_norm": 0.2242397516965866, | |
| "learning_rate": 3.909818648556082e-05, | |
| "loss": 0.1073, | |
| "mean_token_accuracy": 0.9702429622411728, | |
| "num_tokens": 3192533.0, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 0.7984, | |
| "grad_norm": 0.2524878978729248, | |
| "learning_rate": 3.881176235169648e-05, | |
| "loss": 0.1076, | |
| "mean_token_accuracy": 0.9705865532159805, | |
| "num_tokens": 3198976.0, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.23061899840831757, | |
| "learning_rate": 3.852725142397219e-05, | |
| "loss": 0.121, | |
| "mean_token_accuracy": 0.9713504761457443, | |
| "num_tokens": 3205409.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8016, | |
| "grad_norm": 0.21787267923355103, | |
| "learning_rate": 3.8244661348708086e-05, | |
| "loss": 0.1032, | |
| "mean_token_accuracy": 0.9727955311536789, | |
| "num_tokens": 3211844.0, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 0.8032, | |
| "grad_norm": 0.2047341763973236, | |
| "learning_rate": 3.796399972060077e-05, | |
| "loss": 0.0976, | |
| "mean_token_accuracy": 0.9754393398761749, | |
| "num_tokens": 3218134.0, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 0.8048, | |
| "grad_norm": 0.2240447998046875, | |
| "learning_rate": 3.768527408251918e-05, | |
| "loss": 0.1183, | |
| "mean_token_accuracy": 0.9705005437135696, | |
| "num_tokens": 3224500.0, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 0.8064, | |
| "grad_norm": 0.23893991112709045, | |
| "learning_rate": 3.740849192530206e-05, | |
| "loss": 0.13, | |
| "mean_token_accuracy": 0.9692594110965729, | |
| "num_tokens": 3230910.0, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 0.808, | |
| "grad_norm": 0.2310982346534729, | |
| "learning_rate": 3.713366068755636e-05, | |
| "loss": 0.1284, | |
| "mean_token_accuracy": 0.9683378338813782, | |
| "num_tokens": 3237522.0, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.8096, | |
| "grad_norm": 0.25196003913879395, | |
| "learning_rate": 3.6860787755457494e-05, | |
| "loss": 0.1294, | |
| "mean_token_accuracy": 0.9677963703870773, | |
| "num_tokens": 3244000.0, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 0.8112, | |
| "grad_norm": 0.23492568731307983, | |
| "learning_rate": 3.658988046255081e-05, | |
| "loss": 0.1232, | |
| "mean_token_accuracy": 0.970322772860527, | |
| "num_tokens": 3250425.0, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 0.8128, | |
| "grad_norm": 0.21956846117973328, | |
| "learning_rate": 3.632094608955453e-05, | |
| "loss": 0.1018, | |
| "mean_token_accuracy": 0.9748154282569885, | |
| "num_tokens": 3256796.0, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 0.8144, | |
| "grad_norm": 0.23197206854820251, | |
| "learning_rate": 3.605399186416395e-05, | |
| "loss": 0.1042, | |
| "mean_token_accuracy": 0.9718388617038727, | |
| "num_tokens": 3263241.0, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": 0.21163803339004517, | |
| "learning_rate": 3.578902496085734e-05, | |
| "loss": 0.1108, | |
| "mean_token_accuracy": 0.9713537245988846, | |
| "num_tokens": 3269642.0, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.8176, | |
| "grad_norm": 0.2193506956100464, | |
| "learning_rate": 3.55260525007031e-05, | |
| "loss": 0.0949, | |
| "mean_token_accuracy": 0.9723983258008957, | |
| "num_tokens": 3275983.0, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 0.8192, | |
| "grad_norm": 0.23003201186656952, | |
| "learning_rate": 3.5265081551168225e-05, | |
| "loss": 0.1012, | |
| "mean_token_accuracy": 0.9736301898956299, | |
| "num_tokens": 3282325.0, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 0.8208, | |
| "grad_norm": 0.2487151026725769, | |
| "learning_rate": 3.500611912592861e-05, | |
| "loss": 0.1215, | |
| "mean_token_accuracy": 0.9720029532909393, | |
| "num_tokens": 3288695.0, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 0.8224, | |
| "grad_norm": 0.21626487374305725, | |
| "learning_rate": 3.474917218468039e-05, | |
| "loss": 0.11, | |
| "mean_token_accuracy": 0.9702286571264267, | |
| "num_tokens": 3295221.0, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 0.824, | |
| "grad_norm": 0.23146726191043854, | |
| "learning_rate": 3.449424763295291e-05, | |
| "loss": 0.1004, | |
| "mean_token_accuracy": 0.9732625186443329, | |
| "num_tokens": 3301620.0, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.8256, | |
| "grad_norm": 0.2483750432729721, | |
| "learning_rate": 3.424135232192318e-05, | |
| "loss": 0.1307, | |
| "mean_token_accuracy": 0.9673274010419846, | |
| "num_tokens": 3308078.0, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 0.8272, | |
| "grad_norm": 0.24815554916858673, | |
| "learning_rate": 3.399049304823175e-05, | |
| "loss": 0.1176, | |
| "mean_token_accuracy": 0.9694186002016068, | |
| "num_tokens": 3314599.0, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 0.8288, | |
| "grad_norm": 0.22502459585666656, | |
| "learning_rate": 3.374167655380004e-05, | |
| "loss": 0.1098, | |
| "mean_token_accuracy": 0.9722630530595779, | |
| "num_tokens": 3320940.0, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 0.8304, | |
| "grad_norm": 0.253227174282074, | |
| "learning_rate": 3.349490952564907e-05, | |
| "loss": 0.1175, | |
| "mean_token_accuracy": 0.9711975604295731, | |
| "num_tokens": 3327305.0, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 0.2237836867570877, | |
| "learning_rate": 3.3250198595719895e-05, | |
| "loss": 0.0919, | |
| "mean_token_accuracy": 0.9758049696683884, | |
| "num_tokens": 3333642.0, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.8336, | |
| "grad_norm": 0.22460657358169556, | |
| "learning_rate": 3.300755034069527e-05, | |
| "loss": 0.1079, | |
| "mean_token_accuracy": 0.972212627530098, | |
| "num_tokens": 3340052.0, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 0.8352, | |
| "grad_norm": 0.2121167778968811, | |
| "learning_rate": 3.2766971281822844e-05, | |
| "loss": 0.1033, | |
| "mean_token_accuracy": 0.9736848026514053, | |
| "num_tokens": 3346376.0, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 0.8368, | |
| "grad_norm": 0.2357485592365265, | |
| "learning_rate": 3.252846788474009e-05, | |
| "loss": 0.1107, | |
| "mean_token_accuracy": 0.9712750613689423, | |
| "num_tokens": 3352792.0, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 0.8384, | |
| "grad_norm": 0.2749295234680176, | |
| "learning_rate": 3.2292046559300295e-05, | |
| "loss": 0.1173, | |
| "mean_token_accuracy": 0.9701267778873444, | |
| "num_tokens": 3359227.0, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.22125522792339325, | |
| "learning_rate": 3.205771365940052e-05, | |
| "loss": 0.1003, | |
| "mean_token_accuracy": 0.9746531993150711, | |
| "num_tokens": 3365627.0, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.8416, | |
| "grad_norm": 0.24374344944953918, | |
| "learning_rate": 3.182547548281073e-05, | |
| "loss": 0.1097, | |
| "mean_token_accuracy": 0.9698005616664886, | |
| "num_tokens": 3372159.0, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 0.8432, | |
| "grad_norm": 0.23680374026298523, | |
| "learning_rate": 3.159533827100446e-05, | |
| "loss": 0.1204, | |
| "mean_token_accuracy": 0.970520094037056, | |
| "num_tokens": 3378712.0, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 0.8448, | |
| "grad_norm": 0.2179926335811615, | |
| "learning_rate": 3.136730820899126e-05, | |
| "loss": 0.1, | |
| "mean_token_accuracy": 0.9724764376878738, | |
| "num_tokens": 3385119.0, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 0.8464, | |
| "grad_norm": 0.21316486597061157, | |
| "learning_rate": 3.114139142515032e-05, | |
| "loss": 0.1039, | |
| "mean_token_accuracy": 0.972663402557373, | |
| "num_tokens": 3391466.0, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": 0.20808176696300507, | |
| "learning_rate": 3.0917593991065836e-05, | |
| "loss": 0.1049, | |
| "mean_token_accuracy": 0.9736933261156082, | |
| "num_tokens": 3397865.0, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.8496, | |
| "grad_norm": 0.23780664801597595, | |
| "learning_rate": 3.0695921921363744e-05, | |
| "loss": 0.1078, | |
| "mean_token_accuracy": 0.9726853370666504, | |
| "num_tokens": 3404255.0, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 0.8512, | |
| "grad_norm": 0.2280525118112564, | |
| "learning_rate": 3.0476381173550295e-05, | |
| "loss": 0.0989, | |
| "mean_token_accuracy": 0.9736845791339874, | |
| "num_tokens": 3410706.0, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 0.8528, | |
| "grad_norm": 0.23383153975009918, | |
| "learning_rate": 3.0258977647851683e-05, | |
| "loss": 0.1164, | |
| "mean_token_accuracy": 0.9709266275167465, | |
| "num_tokens": 3417150.0, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 0.8544, | |
| "grad_norm": 0.1975809931755066, | |
| "learning_rate": 3.0043717187055603e-05, | |
| "loss": 0.0969, | |
| "mean_token_accuracy": 0.9738510549068451, | |
| "num_tokens": 3423576.0, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 0.856, | |
| "grad_norm": 0.23262329399585724, | |
| "learning_rate": 2.9830605576354237e-05, | |
| "loss": 0.1033, | |
| "mean_token_accuracy": 0.9711253345012665, | |
| "num_tokens": 3429919.0, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.8576, | |
| "grad_norm": 0.22562484443187714, | |
| "learning_rate": 2.9619648543188773e-05, | |
| "loss": 0.1007, | |
| "mean_token_accuracy": 0.9721808582544327, | |
| "num_tokens": 3436331.0, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 0.8592, | |
| "grad_norm": 0.2221885323524475, | |
| "learning_rate": 2.9410851757095374e-05, | |
| "loss": 0.1054, | |
| "mean_token_accuracy": 0.9715123325586319, | |
| "num_tokens": 3442757.0, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 0.8608, | |
| "grad_norm": 0.2144557535648346, | |
| "learning_rate": 2.9204220829552954e-05, | |
| "loss": 0.1039, | |
| "mean_token_accuracy": 0.9732652753591537, | |
| "num_tokens": 3449174.0, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 0.8624, | |
| "grad_norm": 0.23402085900306702, | |
| "learning_rate": 2.8999761313832303e-05, | |
| "loss": 0.1051, | |
| "mean_token_accuracy": 0.9724318087100983, | |
| "num_tokens": 3455530.0, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 0.21033795177936554, | |
| "learning_rate": 2.8797478704846815e-05, | |
| "loss": 0.0894, | |
| "mean_token_accuracy": 0.9749423116445541, | |
| "num_tokens": 3461848.0, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.8656, | |
| "grad_norm": 0.25800594687461853, | |
| "learning_rate": 2.8597378439004774e-05, | |
| "loss": 0.1351, | |
| "mean_token_accuracy": 0.9645904451608658, | |
| "num_tokens": 3468390.0, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 0.8672, | |
| "grad_norm": 0.2593214511871338, | |
| "learning_rate": 2.839946589406348e-05, | |
| "loss": 0.1335, | |
| "mean_token_accuracy": 0.9678925722837448, | |
| "num_tokens": 3474899.0, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 0.8688, | |
| "grad_norm": 0.22763004899024963, | |
| "learning_rate": 2.8203746388984385e-05, | |
| "loss": 0.1032, | |
| "mean_token_accuracy": 0.9715708196163177, | |
| "num_tokens": 3481384.0, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 0.8704, | |
| "grad_norm": 0.21885231137275696, | |
| "learning_rate": 2.801022518379036e-05, | |
| "loss": 0.1038, | |
| "mean_token_accuracy": 0.9714048355817795, | |
| "num_tokens": 3487812.0, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 0.872, | |
| "grad_norm": 0.24403344094753265, | |
| "learning_rate": 2.7818907479424305e-05, | |
| "loss": 0.0941, | |
| "mean_token_accuracy": 0.974847212433815, | |
| "num_tokens": 3494294.0, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.8736, | |
| "grad_norm": 0.23599281907081604, | |
| "learning_rate": 2.762979841760936e-05, | |
| "loss": 0.1104, | |
| "mean_token_accuracy": 0.973253071308136, | |
| "num_tokens": 3500645.0, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 0.8752, | |
| "grad_norm": 0.24596212804317474, | |
| "learning_rate": 2.744290308071063e-05, | |
| "loss": 0.1055, | |
| "mean_token_accuracy": 0.9721562415361404, | |
| "num_tokens": 3507012.0, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 0.8768, | |
| "grad_norm": 0.20906655490398407, | |
| "learning_rate": 2.7258226491598726e-05, | |
| "loss": 0.1007, | |
| "mean_token_accuracy": 0.9728522747755051, | |
| "num_tokens": 3513475.0, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 0.8784, | |
| "grad_norm": 0.25883007049560547, | |
| "learning_rate": 2.7075773613514748e-05, | |
| "loss": 0.1192, | |
| "mean_token_accuracy": 0.9691367298364639, | |
| "num_tokens": 3519967.0, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.2278936356306076, | |
| "learning_rate": 2.6895549349936776e-05, | |
| "loss": 0.1043, | |
| "mean_token_accuracy": 0.9696421474218369, | |
| "num_tokens": 3526409.0, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.8816, | |
| "grad_norm": 0.25115153193473816, | |
| "learning_rate": 2.6717558544448274e-05, | |
| "loss": 0.1173, | |
| "mean_token_accuracy": 0.9696227312088013, | |
| "num_tokens": 3532912.0, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 0.8832, | |
| "grad_norm": 0.21208693087100983, | |
| "learning_rate": 2.6541805980607743e-05, | |
| "loss": 0.1117, | |
| "mean_token_accuracy": 0.9704534560441971, | |
| "num_tokens": 3539295.0, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 0.8848, | |
| "grad_norm": 0.20980143547058105, | |
| "learning_rate": 2.63682963818203e-05, | |
| "loss": 0.1177, | |
| "mean_token_accuracy": 0.9700057953596115, | |
| "num_tokens": 3545780.0, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 0.8864, | |
| "grad_norm": 0.2320992797613144, | |
| "learning_rate": 2.6197034411210653e-05, | |
| "loss": 0.1093, | |
| "mean_token_accuracy": 0.9706778675317764, | |
| "num_tokens": 3552260.0, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 0.888, | |
| "grad_norm": 0.22713179886341095, | |
| "learning_rate": 2.602802467149778e-05, | |
| "loss": 0.1097, | |
| "mean_token_accuracy": 0.9715012460947037, | |
| "num_tokens": 3558725.0, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.8896, | |
| "grad_norm": 0.2165389358997345, | |
| "learning_rate": 2.586127170487127e-05, | |
| "loss": 0.1038, | |
| "mean_token_accuracy": 0.974217414855957, | |
| "num_tokens": 3565028.0, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 0.8912, | |
| "grad_norm": 0.2848284840583801, | |
| "learning_rate": 2.5696779992869253e-05, | |
| "loss": 0.0932, | |
| "mean_token_accuracy": 0.975478395819664, | |
| "num_tokens": 3571320.0, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 0.8928, | |
| "grad_norm": 0.21922437846660614, | |
| "learning_rate": 2.553455395625788e-05, | |
| "loss": 0.1141, | |
| "mean_token_accuracy": 0.9696360230445862, | |
| "num_tokens": 3577784.0, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 0.8944, | |
| "grad_norm": 0.22478172183036804, | |
| "learning_rate": 2.53745979549126e-05, | |
| "loss": 0.1064, | |
| "mean_token_accuracy": 0.9716362059116364, | |
| "num_tokens": 3584252.0, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 0.21486598253250122, | |
| "learning_rate": 2.5216916287700988e-05, | |
| "loss": 0.0918, | |
| "mean_token_accuracy": 0.9771014899015427, | |
| "num_tokens": 3590552.0, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.8976, | |
| "grad_norm": 0.1994141787290573, | |
| "learning_rate": 2.5061513192367097e-05, | |
| "loss": 0.0989, | |
| "mean_token_accuracy": 0.9735328704118729, | |
| "num_tokens": 3596957.0, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 0.8992, | |
| "grad_norm": 0.2314392626285553, | |
| "learning_rate": 2.4908392845417688e-05, | |
| "loss": 0.1046, | |
| "mean_token_accuracy": 0.9704307615756989, | |
| "num_tokens": 3603398.0, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 0.9008, | |
| "grad_norm": 0.23387351632118225, | |
| "learning_rate": 2.4757559362009992e-05, | |
| "loss": 0.1116, | |
| "mean_token_accuracy": 0.9691499918699265, | |
| "num_tokens": 3609925.0, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 0.9024, | |
| "grad_norm": 0.2047821581363678, | |
| "learning_rate": 2.4609016795841e-05, | |
| "loss": 0.1021, | |
| "mean_token_accuracy": 0.9772474765777588, | |
| "num_tokens": 3616233.0, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 0.904, | |
| "grad_norm": 0.2516949772834778, | |
| "learning_rate": 2.4462769139038606e-05, | |
| "loss": 0.1027, | |
| "mean_token_accuracy": 0.9723426103591919, | |
| "num_tokens": 3622613.0, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.9056, | |
| "grad_norm": 0.20579637587070465, | |
| "learning_rate": 2.431882032205431e-05, | |
| "loss": 0.1059, | |
| "mean_token_accuracy": 0.9714504182338715, | |
| "num_tokens": 3629074.0, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 0.9072, | |
| "grad_norm": 0.2183055281639099, | |
| "learning_rate": 2.417717421355758e-05, | |
| "loss": 0.0969, | |
| "mean_token_accuracy": 0.9734102934598923, | |
| "num_tokens": 3635498.0, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 0.9088, | |
| "grad_norm": 0.24122858047485352, | |
| "learning_rate": 2.4037834620331855e-05, | |
| "loss": 0.1, | |
| "mean_token_accuracy": 0.9758709222078323, | |
| "num_tokens": 3641963.0, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 0.9104, | |
| "grad_norm": 0.2248363047838211, | |
| "learning_rate": 2.390080528717223e-05, | |
| "loss": 0.1091, | |
| "mean_token_accuracy": 0.9742082208395004, | |
| "num_tokens": 3648355.0, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": 0.24981184303760529, | |
| "learning_rate": 2.376608989678491e-05, | |
| "loss": 0.1267, | |
| "mean_token_accuracy": 0.9681502133607864, | |
| "num_tokens": 3654848.0, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.9136, | |
| "grad_norm": 0.2261212319135666, | |
| "learning_rate": 2.363369206968811e-05, | |
| "loss": 0.1071, | |
| "mean_token_accuracy": 0.9735273569822311, | |
| "num_tokens": 3661203.0, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 0.9152, | |
| "grad_norm": 0.23203207552433014, | |
| "learning_rate": 2.3503615364114796e-05, | |
| "loss": 0.1151, | |
| "mean_token_accuracy": 0.9706272333860397, | |
| "num_tokens": 3667634.0, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 0.9168, | |
| "grad_norm": 0.2189394235610962, | |
| "learning_rate": 2.337586327591712e-05, | |
| "loss": 0.0983, | |
| "mean_token_accuracy": 0.9768995195627213, | |
| "num_tokens": 3674003.0, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 0.9184, | |
| "grad_norm": 0.2351296842098236, | |
| "learning_rate": 2.3250439238472392e-05, | |
| "loss": 0.1147, | |
| "mean_token_accuracy": 0.971548855304718, | |
| "num_tokens": 3680397.0, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.21025338768959045, | |
| "learning_rate": 2.3127346622590796e-05, | |
| "loss": 0.0915, | |
| "mean_token_accuracy": 0.9755335301160812, | |
| "num_tokens": 3686765.0, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.9216, | |
| "grad_norm": 0.22534166276454926, | |
| "learning_rate": 2.3006588736424883e-05, | |
| "loss": 0.1183, | |
| "mean_token_accuracy": 0.9688543230295181, | |
| "num_tokens": 3693171.0, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 0.9232, | |
| "grad_norm": 0.2072216421365738, | |
| "learning_rate": 2.28881688253806e-05, | |
| "loss": 0.0947, | |
| "mean_token_accuracy": 0.974353164434433, | |
| "num_tokens": 3699566.0, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 0.9248, | |
| "grad_norm": 0.2115691751241684, | |
| "learning_rate": 2.277209007203005e-05, | |
| "loss": 0.0948, | |
| "mean_token_accuracy": 0.9741936177015305, | |
| "num_tokens": 3705856.0, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 0.9264, | |
| "grad_norm": 0.23990494012832642, | |
| "learning_rate": 2.2658355596026043e-05, | |
| "loss": 0.087, | |
| "mean_token_accuracy": 0.9763388186693192, | |
| "num_tokens": 3712208.0, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 0.217073455452919, | |
| "learning_rate": 2.2546968454018146e-05, | |
| "loss": 0.0987, | |
| "mean_token_accuracy": 0.9748837798833847, | |
| "num_tokens": 3718463.0, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.9296, | |
| "grad_norm": 0.22759057581424713, | |
| "learning_rate": 2.243793163957063e-05, | |
| "loss": 0.1049, | |
| "mean_token_accuracy": 0.9736155718564987, | |
| "num_tokens": 3724807.0, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 0.9312, | |
| "grad_norm": 0.22293473780155182, | |
| "learning_rate": 2.233124808308198e-05, | |
| "loss": 0.1156, | |
| "mean_token_accuracy": 0.9709077328443527, | |
| "num_tokens": 3731176.0, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 0.9328, | |
| "grad_norm": 0.20852118730545044, | |
| "learning_rate": 2.2226920651706117e-05, | |
| "loss": 0.097, | |
| "mean_token_accuracy": 0.9740815609693527, | |
| "num_tokens": 3737662.0, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 0.9344, | |
| "grad_norm": 0.22608016431331635, | |
| "learning_rate": 2.2124952149275372e-05, | |
| "loss": 0.1119, | |
| "mean_token_accuracy": 0.9715410768985748, | |
| "num_tokens": 3744001.0, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 0.936, | |
| "grad_norm": 0.2192792445421219, | |
| "learning_rate": 2.2025345316225125e-05, | |
| "loss": 0.1072, | |
| "mean_token_accuracy": 0.9721464216709137, | |
| "num_tokens": 3750369.0, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.9376, | |
| "grad_norm": 0.22682707011699677, | |
| "learning_rate": 2.192810282952013e-05, | |
| "loss": 0.106, | |
| "mean_token_accuracy": 0.9711346477270126, | |
| "num_tokens": 3756776.0, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 0.9392, | |
| "grad_norm": 0.2189428061246872, | |
| "learning_rate": 2.1833227302582646e-05, | |
| "loss": 0.1001, | |
| "mean_token_accuracy": 0.973629042506218, | |
| "num_tokens": 3763182.0, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 0.9408, | |
| "grad_norm": 0.2045506238937378, | |
| "learning_rate": 2.174072128522212e-05, | |
| "loss": 0.0969, | |
| "mean_token_accuracy": 0.9742176085710526, | |
| "num_tokens": 3769545.0, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 0.9424, | |
| "grad_norm": 0.22102801501750946, | |
| "learning_rate": 2.1650587263566658e-05, | |
| "loss": 0.0996, | |
| "mean_token_accuracy": 0.9751187860965729, | |
| "num_tokens": 3775864.0, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": 0.1995226889848709, | |
| "learning_rate": 2.1562827659996272e-05, | |
| "loss": 0.0928, | |
| "mean_token_accuracy": 0.9757037907838821, | |
| "num_tokens": 3782257.0, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.9456, | |
| "grad_norm": 0.22786815464496613, | |
| "learning_rate": 2.147744483307776e-05, | |
| "loss": 0.1075, | |
| "mean_token_accuracy": 0.9706543982028961, | |
| "num_tokens": 3788846.0, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 0.9472, | |
| "grad_norm": 0.22925198078155518, | |
| "learning_rate": 2.1394441077501263e-05, | |
| "loss": 0.1055, | |
| "mean_token_accuracy": 0.9691334515810013, | |
| "num_tokens": 3795365.0, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 0.9488, | |
| "grad_norm": 0.23679634928703308, | |
| "learning_rate": 2.1313818624018684e-05, | |
| "loss": 0.1178, | |
| "mean_token_accuracy": 0.9704981744289398, | |
| "num_tokens": 3801781.0, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 0.9504, | |
| "grad_norm": 0.21451805531978607, | |
| "learning_rate": 2.123557963938364e-05, | |
| "loss": 0.0984, | |
| "mean_token_accuracy": 0.976891040802002, | |
| "num_tokens": 3808115.0, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 0.952, | |
| "grad_norm": 0.20589299499988556, | |
| "learning_rate": 2.1159726226293323e-05, | |
| "loss": 0.0842, | |
| "mean_token_accuracy": 0.9760564416646957, | |
| "num_tokens": 3814481.0, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.9536, | |
| "grad_norm": 0.2425495684146881, | |
| "learning_rate": 2.10862604233319e-05, | |
| "loss": 0.0992, | |
| "mean_token_accuracy": 0.9721360504627228, | |
| "num_tokens": 3820960.0, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 0.9552, | |
| "grad_norm": 0.26068660616874695, | |
| "learning_rate": 2.1015184204915797e-05, | |
| "loss": 0.1356, | |
| "mean_token_accuracy": 0.9667875170707703, | |
| "num_tokens": 3827413.0, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 0.9568, | |
| "grad_norm": 0.2210782766342163, | |
| "learning_rate": 2.094649948124061e-05, | |
| "loss": 0.0991, | |
| "mean_token_accuracy": 0.9746987223625183, | |
| "num_tokens": 3833832.0, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 0.9584, | |
| "grad_norm": 0.22755607962608337, | |
| "learning_rate": 2.088020809822976e-05, | |
| "loss": 0.1112, | |
| "mean_token_accuracy": 0.9687243103981018, | |
| "num_tokens": 3840279.0, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.2218703180551529, | |
| "learning_rate": 2.0816311837484866e-05, | |
| "loss": 0.1068, | |
| "mean_token_accuracy": 0.9720739126205444, | |
| "num_tokens": 3846737.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.9616, | |
| "grad_norm": 0.24559172987937927, | |
| "learning_rate": 2.075481241623793e-05, | |
| "loss": 0.1141, | |
| "mean_token_accuracy": 0.9725874066352844, | |
| "num_tokens": 3853132.0, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 0.9632, | |
| "grad_norm": 0.23787567019462585, | |
| "learning_rate": 2.0695711487305118e-05, | |
| "loss": 0.1055, | |
| "mean_token_accuracy": 0.9718718379735947, | |
| "num_tokens": 3859426.0, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 0.9648, | |
| "grad_norm": 0.23051463067531586, | |
| "learning_rate": 2.0639010639042368e-05, | |
| "loss": 0.1011, | |
| "mean_token_accuracy": 0.9732188433408737, | |
| "num_tokens": 3865825.0, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 0.9664, | |
| "grad_norm": 0.21166487038135529, | |
| "learning_rate": 2.0584711395302697e-05, | |
| "loss": 0.0928, | |
| "mean_token_accuracy": 0.975051075220108, | |
| "num_tokens": 3872251.0, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 0.968, | |
| "grad_norm": 0.24081633985042572, | |
| "learning_rate": 2.053281521539527e-05, | |
| "loss": 0.0963, | |
| "mean_token_accuracy": 0.9765129089355469, | |
| "num_tokens": 3878612.0, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.9696, | |
| "grad_norm": 0.2403188794851303, | |
| "learning_rate": 2.048332349404613e-05, | |
| "loss": 0.0997, | |
| "mean_token_accuracy": 0.9744224399328232, | |
| "num_tokens": 3884976.0, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 0.9712, | |
| "grad_norm": 0.2533579468727112, | |
| "learning_rate": 2.0436237561360763e-05, | |
| "loss": 0.0993, | |
| "mean_token_accuracy": 0.9729606509208679, | |
| "num_tokens": 3891334.0, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 0.9728, | |
| "grad_norm": 0.2714720070362091, | |
| "learning_rate": 2.0391558682788327e-05, | |
| "loss": 0.1024, | |
| "mean_token_accuracy": 0.9731218069791794, | |
| "num_tokens": 3897740.0, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 0.9744, | |
| "grad_norm": 0.24020877480506897, | |
| "learning_rate": 2.0349288059087655e-05, | |
| "loss": 0.113, | |
| "mean_token_accuracy": 0.9712254405021667, | |
| "num_tokens": 3904080.0, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 0.976, | |
| "grad_norm": 0.2139822244644165, | |
| "learning_rate": 2.0309426826294975e-05, | |
| "loss": 0.1073, | |
| "mean_token_accuracy": 0.9721011370420456, | |
| "num_tokens": 3910491.0, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.9776, | |
| "grad_norm": 0.24366411566734314, | |
| "learning_rate": 2.0271976055693368e-05, | |
| "loss": 0.0949, | |
| "mean_token_accuracy": 0.973763182759285, | |
| "num_tokens": 3916819.0, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 0.9792, | |
| "grad_norm": 0.20686177909374237, | |
| "learning_rate": 2.023693675378401e-05, | |
| "loss": 0.09, | |
| "mean_token_accuracy": 0.9776096940040588, | |
| "num_tokens": 3923125.0, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 0.9808, | |
| "grad_norm": 0.20820122957229614, | |
| "learning_rate": 2.020430986225909e-05, | |
| "loss": 0.094, | |
| "mean_token_accuracy": 0.9756556898355484, | |
| "num_tokens": 3929496.0, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 0.9824, | |
| "grad_norm": 0.2249443382024765, | |
| "learning_rate": 2.0174096257976515e-05, | |
| "loss": 0.1083, | |
| "mean_token_accuracy": 0.9736926257610321, | |
| "num_tokens": 3935859.0, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 0.984, | |
| "grad_norm": 0.2198774665594101, | |
| "learning_rate": 2.0146296752936345e-05, | |
| "loss": 0.097, | |
| "mean_token_accuracy": 0.9754761904478073, | |
| "num_tokens": 3942237.0, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.9856, | |
| "grad_norm": 0.23392657935619354, | |
| "learning_rate": 2.012091209425897e-05, | |
| "loss": 0.1074, | |
| "mean_token_accuracy": 0.9731382429599762, | |
| "num_tokens": 3948529.0, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 0.9872, | |
| "grad_norm": 0.23008355498313904, | |
| "learning_rate": 2.0097942964165028e-05, | |
| "loss": 0.1033, | |
| "mean_token_accuracy": 0.972972571849823, | |
| "num_tokens": 3954829.0, | |
| "step": 617 | |
| }, | |
| { | |
| "epoch": 0.9888, | |
| "grad_norm": 0.24449172616004944, | |
| "learning_rate": 2.0077389979957064e-05, | |
| "loss": 0.104, | |
| "mean_token_accuracy": 0.9740715324878693, | |
| "num_tokens": 3961285.0, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 0.9904, | |
| "grad_norm": 0.2150583118200302, | |
| "learning_rate": 2.0059253694002965e-05, | |
| "loss": 0.0995, | |
| "mean_token_accuracy": 0.9742254763841629, | |
| "num_tokens": 3967690.0, | |
| "step": 619 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": 0.2424461841583252, | |
| "learning_rate": 2.0043534593721075e-05, | |
| "loss": 0.1047, | |
| "mean_token_accuracy": 0.9742003828287125, | |
| "num_tokens": 3974106.0, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.9936, | |
| "grad_norm": 0.23780013620853424, | |
| "learning_rate": 2.0030233101567135e-05, | |
| "loss": 0.0964, | |
| "mean_token_accuracy": 0.9739386290311813, | |
| "num_tokens": 3980562.0, | |
| "step": 621 | |
| }, | |
| { | |
| "epoch": 0.9952, | |
| "grad_norm": 0.22070133686065674, | |
| "learning_rate": 2.0019349575022896e-05, | |
| "loss": 0.1046, | |
| "mean_token_accuracy": 0.9738975316286087, | |
| "num_tokens": 3986975.0, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 0.9968, | |
| "grad_norm": 0.2582930624485016, | |
| "learning_rate": 2.0010884306586545e-05, | |
| "loss": 0.1074, | |
| "mean_token_accuracy": 0.9711170047521591, | |
| "num_tokens": 3993359.0, | |
| "step": 623 | |
| }, | |
| { | |
| "epoch": 0.9984, | |
| "grad_norm": 0.24187861382961273, | |
| "learning_rate": 2.000483752376482e-05, | |
| "loss": 0.102, | |
| "mean_token_accuracy": 0.972094938158989, | |
| "num_tokens": 3999747.0, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.23207087814807892, | |
| "learning_rate": 2.0001209389066884e-05, | |
| "loss": 0.1079, | |
| "mean_token_accuracy": 0.9717442691326141, | |
| "num_tokens": 4006123.0, | |
| "step": 625 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 625, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.1043617489556224e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |