{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9990717600856835,
  "eval_steps": 500,
  "global_step": 7002,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004284184219921457,
      "grad_norm": 7.21875,
      "learning_rate": 1.0688836104513063e-05,
      "loss": 2.91,
      "mean_token_accuracy": 0.5702525054415067,
      "step": 10
    },
    {
      "epoch": 0.008568368439842914,
      "grad_norm": 3.453125,
      "learning_rate": 2.2565320665083135e-05,
      "loss": 1.6569,
      "mean_token_accuracy": 0.6329187631607056,
      "step": 20
    },
    {
      "epoch": 0.01285255265976437,
      "grad_norm": 2.203125,
      "learning_rate": 3.44418052256532e-05,
      "loss": 1.4104,
      "mean_token_accuracy": 0.657142640153567,
      "step": 30
    },
    {
      "epoch": 0.017136736879685827,
      "grad_norm": 2.375,
      "learning_rate": 4.631828978622328e-05,
      "loss": 1.4205,
      "mean_token_accuracy": 0.657102554043134,
      "step": 40
    },
    {
      "epoch": 0.021420921099607283,
      "grad_norm": 2.0625,
      "learning_rate": 5.819477434679335e-05,
      "loss": 1.4479,
      "mean_token_accuracy": 0.6437717239061992,
      "step": 50
    },
    {
      "epoch": 0.02570510531952874,
      "grad_norm": 1.921875,
      "learning_rate": 7.007125890736342e-05,
      "loss": 1.4744,
      "mean_token_accuracy": 0.6392371306816736,
      "step": 60
    },
    {
      "epoch": 0.029989289539450195,
      "grad_norm": 1.8984375,
      "learning_rate": 8.194774346793349e-05,
      "loss": 1.4946,
      "mean_token_accuracy": 0.6394953320423762,
      "step": 70
    },
    {
      "epoch": 0.034273473759371655,
      "grad_norm": 1.9140625,
      "learning_rate": 9.382422802850355e-05,
      "loss": 1.5123,
      "mean_token_accuracy": 0.6348034431536992,
      "step": 80
    },
    {
      "epoch": 0.03855765797929311,
      "grad_norm": 1.7265625,
      "learning_rate": 0.00010570071258907364,
      "loss": 1.5446,
      "mean_token_accuracy": 0.6331378062566121,
      "step": 90
    },
    {
      "epoch": 0.042841842199214566,
      "grad_norm": 1.671875,
      "learning_rate": 0.00011757719714964371,
      "loss": 1.6083,
      "mean_token_accuracy": 0.6170119682947794,
      "step": 100
    },
    {
      "epoch": 0.047126026419136026,
      "grad_norm": 1.6484375,
      "learning_rate": 0.00012945368171021377,
      "loss": 1.6014,
      "mean_token_accuracy": 0.6203826546669007,
      "step": 110
    },
    {
      "epoch": 0.05141021063905748,
      "grad_norm": 6.375,
      "learning_rate": 0.00014133016627078385,
      "loss": 1.65,
      "mean_token_accuracy": 0.6120437135299047,
      "step": 120
    },
    {
      "epoch": 0.05569439485897894,
      "grad_norm": 1.7265625,
      "learning_rate": 0.00015320665083135392,
      "loss": 1.7805,
      "mean_token_accuracy": 0.5932602127393086,
      "step": 130
    },
    {
      "epoch": 0.05997857907890039,
      "grad_norm": 1.484375,
      "learning_rate": 0.000165083135391924,
      "loss": 1.6786,
      "mean_token_accuracy": 0.605377835035324,
      "step": 140
    },
    {
      "epoch": 0.06426276329882184,
      "grad_norm": 1.7109375,
      "learning_rate": 0.00017695961995249407,
      "loss": 1.6794,
      "mean_token_accuracy": 0.6040138930082322,
      "step": 150
    },
    {
      "epoch": 0.06854694751874331,
      "grad_norm": 1.5625,
      "learning_rate": 0.00018883610451306412,
      "loss": 1.6891,
      "mean_token_accuracy": 0.6048624157905579,
      "step": 160
    },
    {
      "epoch": 0.07283113173866476,
      "grad_norm": 1.5625,
      "learning_rate": 0.00020071258907363422,
      "loss": 1.7496,
      "mean_token_accuracy": 0.5943053344885508,
      "step": 170
    },
    {
      "epoch": 0.07711531595858621,
      "grad_norm": 1.6796875,
      "learning_rate": 0.00021258907363420426,
      "loss": 1.8166,
      "mean_token_accuracy": 0.5864472662409147,
      "step": 180
    },
    {
      "epoch": 0.08139950017850768,
      "grad_norm": 1.5,
      "learning_rate": 0.00022446555819477434,
      "loss": 1.804,
      "mean_token_accuracy": 0.5849504172801971,
      "step": 190
    },
    {
      "epoch": 0.08568368439842913,
      "grad_norm": 1.2734375,
      "learning_rate": 0.00023634204275534444,
      "loss": 1.8061,
      "mean_token_accuracy": 0.5850104331970215,
      "step": 200
    },
    {
      "epoch": 0.08996786861835059,
      "grad_norm": 1.5390625,
      "learning_rate": 0.0002482185273159145,
      "loss": 1.8503,
      "mean_token_accuracy": 0.5780546754598618,
      "step": 210
    },
    {
      "epoch": 0.09425205283827205,
      "grad_norm": 1.2578125,
      "learning_rate": 0.00026009501187648456,
      "loss": 1.7982,
      "mean_token_accuracy": 0.5852496673663458,
      "step": 220
    },
    {
      "epoch": 0.0985362370581935,
      "grad_norm": 1.25,
      "learning_rate": 0.00027197149643705463,
      "loss": 1.8589,
      "mean_token_accuracy": 0.5725395048658053,
      "step": 230
    },
    {
      "epoch": 0.10282042127811496,
      "grad_norm": 1.28125,
      "learning_rate": 0.0002838479809976247,
      "loss": 1.8616,
      "mean_token_accuracy": 0.5743008524179458,
      "step": 240
    },
    {
      "epoch": 0.10710460549803641,
      "grad_norm": 5.71875,
      "learning_rate": 0.0002957244655581948,
      "loss": 1.8569,
      "mean_token_accuracy": 0.5772680620352427,
      "step": 250
    },
    {
      "epoch": 0.11138878971795788,
      "grad_norm": 1.234375,
      "learning_rate": 0.00030760095011876486,
      "loss": 1.883,
      "mean_token_accuracy": 0.5712258398532868,
      "step": 260
    },
    {
      "epoch": 0.11567297393787933,
      "grad_norm": 1.328125,
      "learning_rate": 0.00031947743467933493,
      "loss": 1.9089,
      "mean_token_accuracy": 0.5683296079436938,
      "step": 270
    },
    {
      "epoch": 0.11995715815780078,
      "grad_norm": 1.140625,
      "learning_rate": 0.000331353919239905,
      "loss": 1.8845,
      "mean_token_accuracy": 0.5682273174325625,
      "step": 280
    },
    {
      "epoch": 0.12424134237772225,
      "grad_norm": 1.046875,
      "learning_rate": 0.000343230403800475,
      "loss": 1.8897,
      "mean_token_accuracy": 0.5696620404720306,
      "step": 290
    },
    {
      "epoch": 0.12852552659764369,
      "grad_norm": 1.1640625,
      "learning_rate": 0.00035510688836104515,
      "loss": 1.8809,
      "mean_token_accuracy": 0.571748511493206,
      "step": 300
    },
    {
      "epoch": 0.13280971081756515,
      "grad_norm": 2.640625,
      "learning_rate": 0.0003669833729216152,
      "loss": 1.9614,
      "mean_token_accuracy": 0.566138303776582,
      "step": 310
    },
    {
      "epoch": 0.13709389503748662,
      "grad_norm": 1.109375,
      "learning_rate": 0.00037885985748218525,
      "loss": 1.969,
      "mean_token_accuracy": 0.5570171405871709,
      "step": 320
    },
    {
      "epoch": 0.14137807925740806,
      "grad_norm": 1.1640625,
      "learning_rate": 0.0003907363420427554,
      "loss": 1.902,
      "mean_token_accuracy": 0.5694692318638166,
      "step": 330
    },
    {
      "epoch": 0.14566226347732952,
      "grad_norm": 1.1171875,
      "learning_rate": 0.00040261282660332545,
      "loss": 1.9013,
      "mean_token_accuracy": 0.5666690587997436,
      "step": 340
    },
    {
      "epoch": 0.149946447697251,
      "grad_norm": 0.96484375,
      "learning_rate": 0.00041448931116389547,
      "loss": 1.9115,
      "mean_token_accuracy": 0.5653235624233882,
      "step": 350
    },
    {
      "epoch": 0.15423063191717243,
      "grad_norm": 1.2109375,
      "learning_rate": 0.00042636579572446554,
      "loss": 1.9273,
      "mean_token_accuracy": 0.5625237961610158,
      "step": 360
    },
    {
      "epoch": 0.1585148161370939,
      "grad_norm": 0.9140625,
      "learning_rate": 0.00043824228028503567,
      "loss": 1.9705,
      "mean_token_accuracy": 0.5571655298272769,
      "step": 370
    },
    {
      "epoch": 0.16279900035701536,
      "grad_norm": 1.1015625,
      "learning_rate": 0.0004501187648456057,
      "loss": 1.9421,
      "mean_token_accuracy": 0.5626261870066325,
      "step": 380
    },
    {
      "epoch": 0.1670831845769368,
      "grad_norm": 0.89453125,
      "learning_rate": 0.00046199524940617576,
      "loss": 1.9847,
      "mean_token_accuracy": 0.5572134067614873,
      "step": 390
    },
    {
      "epoch": 0.17136736879685827,
      "grad_norm": 0.96875,
      "learning_rate": 0.0004738717339667459,
      "loss": 1.9701,
      "mean_token_accuracy": 0.5576820741097133,
      "step": 400
    },
    {
      "epoch": 0.17565155301677973,
      "grad_norm": 0.875,
      "learning_rate": 0.0004857482185273159,
      "loss": 1.9652,
      "mean_token_accuracy": 0.5562737271189689,
      "step": 410
    },
    {
      "epoch": 0.17993573723670117,
      "grad_norm": 0.984375,
      "learning_rate": 0.000497624703087886,
      "loss": 1.9838,
      "mean_token_accuracy": 0.553861757616202,
      "step": 420
    },
    {
      "epoch": 0.18421992145662264,
      "grad_norm": 0.90625,
      "learning_rate": 0.0004999981769212751,
      "loss": 1.9842,
      "mean_token_accuracy": 0.5521314447124799,
      "step": 430
    },
    {
      "epoch": 0.1885041056765441,
      "grad_norm": 0.76171875,
      "learning_rate": 0.0004999907707095249,
      "loss": 1.9573,
      "mean_token_accuracy": 0.5590350007017454,
      "step": 440
    },
    {
      "epoch": 0.19278828989646554,
      "grad_norm": 0.79296875,
      "learning_rate": 0.0004999776675909755,
      "loss": 1.9179,
      "mean_token_accuracy": 0.5650409559408823,
      "step": 450
    },
    {
      "epoch": 0.197072474116387,
      "grad_norm": 0.8203125,
      "learning_rate": 0.000499958867864227,
      "loss": 1.9485,
      "mean_token_accuracy": 0.5609820206960042,
      "step": 460
    },
    {
      "epoch": 0.20135665833630847,
      "grad_norm": 0.7890625,
      "learning_rate": 0.0004999343719576963,
      "loss": 1.9483,
      "mean_token_accuracy": 0.5612422108650208,
      "step": 470
    },
    {
      "epoch": 0.2056408425562299,
      "grad_norm": 0.82421875,
      "learning_rate": 0.0004999041804296074,
      "loss": 1.9683,
      "mean_token_accuracy": 0.561284634967645,
      "step": 480
    },
    {
      "epoch": 0.20992502677615138,
      "grad_norm": 0.76953125,
      "learning_rate": 0.0004998682939679794,
      "loss": 1.9333,
      "mean_token_accuracy": 0.56248503079017,
      "step": 490
    },
    {
      "epoch": 0.21420921099607282,
      "grad_norm": 0.72265625,
      "learning_rate": 0.0004998267133906095,
      "loss": 1.8902,
      "mean_token_accuracy": 0.568443168203036,
      "step": 500
    },
    {
      "epoch": 0.21849339521599428,
      "grad_norm": 0.8125,
      "learning_rate": 0.0004997794396450555,
      "loss": 1.9334,
      "mean_token_accuracy": 0.5650926142930984,
      "step": 510
    },
    {
      "epoch": 0.22277757943591575,
      "grad_norm": 0.79296875,
      "learning_rate": 0.0004997264738086136,
      "loss": 1.9271,
      "mean_token_accuracy": 0.5642434308926264,
      "step": 520
    },
    {
      "epoch": 0.2270617636558372,
      "grad_norm": 0.7109375,
      "learning_rate": 0.0004996678170882941,
      "loss": 1.9352,
      "mean_token_accuracy": 0.5623646408319474,
      "step": 530
    },
    {
      "epoch": 0.23134594787575866,
      "grad_norm": 0.8125,
      "learning_rate": 0.000499603470820794,
      "loss": 1.9214,
      "mean_token_accuracy": 0.5678030242522557,
      "step": 540
    },
    {
      "epoch": 0.23563013209568012,
      "grad_norm": 0.65625,
      "learning_rate": 0.0004995334364724658,
      "loss": 1.9247,
      "mean_token_accuracy": 0.5662606621781985,
      "step": 550
    },
    {
      "epoch": 0.23991431631560156,
      "grad_norm": 0.64453125,
      "learning_rate": 0.0004994577156392854,
      "loss": 1.9279,
      "mean_token_accuracy": 0.5633541295925776,
      "step": 560
    },
    {
      "epoch": 0.24419850053552303,
      "grad_norm": 0.640625,
      "learning_rate": 0.0004993763100468144,
      "loss": 1.8886,
      "mean_token_accuracy": 0.5679133623838425,
      "step": 570
    },
    {
      "epoch": 0.2484826847554445,
      "grad_norm": 0.6953125,
      "learning_rate": 0.0004992892215501618,
      "loss": 1.8938,
      "mean_token_accuracy": 0.5673948327700297,
      "step": 580
    },
    {
      "epoch": 0.25276686897536593,
      "grad_norm": 0.671875,
      "learning_rate": 0.0004991964521339408,
      "loss": 1.8795,
      "mean_token_accuracy": 0.5750111728906632,
      "step": 590
    },
    {
      "epoch": 0.25705105319528737,
      "grad_norm": 0.640625,
      "learning_rate": 0.0004990980039122245,
      "loss": 1.8579,
      "mean_token_accuracy": 0.5755017310380935,
      "step": 600
    },
    {
      "epoch": 0.26133523741520887,
      "grad_norm": 0.65234375,
      "learning_rate": 0.0004989938791284971,
      "loss": 1.8579,
      "mean_token_accuracy": 0.5754131337006887,
      "step": 610
    },
    {
      "epoch": 0.2656194216351303,
      "grad_norm": 0.69921875,
      "learning_rate": 0.0004988840801556029,
      "loss": 1.9225,
      "mean_token_accuracy": 0.5679664716124535,
      "step": 620
    },
    {
      "epoch": 0.26990360585505174,
      "grad_norm": 0.625,
      "learning_rate": 0.0004987686094956922,
      "loss": 1.8843,
      "mean_token_accuracy": 0.5681375061472257,
      "step": 630
    },
    {
      "epoch": 0.27418779007497324,
      "grad_norm": 0.69140625,
      "learning_rate": 0.0004986474697801647,
      "loss": 1.8747,
      "mean_token_accuracy": 0.5750736912091573,
      "step": 640
    },
    {
      "epoch": 0.2784719742948947,
      "grad_norm": 0.65234375,
      "learning_rate": 0.000498520663769609,
      "loss": 1.8875,
      "mean_token_accuracy": 0.568752312163512,
      "step": 650
    },
    {
      "epoch": 0.2827561585148161,
      "grad_norm": 0.609375,
      "learning_rate": 0.0004983881943537396,
      "loss": 1.8393,
      "mean_token_accuracy": 0.5768392990032832,
      "step": 660
    },
    {
      "epoch": 0.2870403427347376,
      "grad_norm": 0.59765625,
      "learning_rate": 0.0004982500645513319,
      "loss": 1.8389,
      "mean_token_accuracy": 0.5770889202753703,
      "step": 670
    },
    {
      "epoch": 0.29132452695465905,
      "grad_norm": 0.62890625,
      "learning_rate": 0.0004981062775101524,
      "loss": 1.812,
      "mean_token_accuracy": 0.5807933017611504,
      "step": 680
    },
    {
      "epoch": 0.2956087111745805,
      "grad_norm": 0.58984375,
      "learning_rate": 0.0004979568365068878,
      "loss": 1.8463,
      "mean_token_accuracy": 0.571879476805528,
      "step": 690
    },
    {
      "epoch": 0.299892895394502,
      "grad_norm": 0.60546875,
      "learning_rate": 0.0004978017449470692,
      "loss": 1.8659,
      "mean_token_accuracy": 0.575709896783034,
      "step": 700
    },
    {
      "epoch": 0.3041770796144234,
      "grad_norm": 0.55859375,
      "learning_rate": 0.0004976410063649963,
      "loss": 1.8381,
      "mean_token_accuracy": 0.577228785554568,
      "step": 710
    },
    {
      "epoch": 0.30846126383434486,
      "grad_norm": 0.56640625,
      "learning_rate": 0.0004974746244236546,
      "loss": 1.7498,
      "mean_token_accuracy": 0.5971198469400406,
      "step": 720
    },
    {
      "epoch": 0.31274544805426635,
      "grad_norm": 0.56640625,
      "learning_rate": 0.0004973026029146343,
      "loss": 1.797,
      "mean_token_accuracy": 0.5817218641440074,
      "step": 730
    },
    {
      "epoch": 0.3170296322741878,
      "grad_norm": 0.55859375,
      "learning_rate": 0.0004971249457580418,
      "loss": 1.8144,
      "mean_token_accuracy": 0.5801170518000921,
      "step": 740
    },
    {
      "epoch": 0.32131381649410923,
      "grad_norm": 0.6015625,
      "learning_rate": 0.0004969416570024118,
      "loss": 1.8323,
      "mean_token_accuracy": 0.5770084033409755,
      "step": 750
    },
    {
      "epoch": 0.3255980007140307,
      "grad_norm": 0.5859375,
      "learning_rate": 0.0004967527408246142,
      "loss": 1.8112,
      "mean_token_accuracy": 0.5806623538335164,
      "step": 760
    },
    {
      "epoch": 0.32988218493395216,
      "grad_norm": 0.55078125,
      "learning_rate": 0.0004965582015297593,
      "loss": 1.7814,
      "mean_token_accuracy": 0.590585180123647,
      "step": 770
    },
    {
      "epoch": 0.3341663691538736,
      "grad_norm": 0.5703125,
      "learning_rate": 0.0004963580435510999,
      "loss": 1.7749,
      "mean_token_accuracy": 0.5853669941425323,
      "step": 780
    },
    {
      "epoch": 0.3384505533737951,
      "grad_norm": 0.57421875,
      "learning_rate": 0.0004961522714499296,
      "loss": 1.8028,
      "mean_token_accuracy": 0.5840177754561107,
      "step": 790
    },
    {
      "epoch": 0.34273473759371653,
      "grad_norm": 0.53125,
      "learning_rate": 0.0004959408899154796,
      "loss": 1.8328,
      "mean_token_accuracy": 0.5787926654020945,
      "step": 800
    },
    {
      "epoch": 0.34701892181363797,
      "grad_norm": 0.546875,
      "learning_rate": 0.0004957239037648111,
      "loss": 1.8215,
      "mean_token_accuracy": 0.5796164035797119,
      "step": 810
    },
    {
      "epoch": 0.35130310603355946,
      "grad_norm": 0.546875,
      "learning_rate": 0.0004955013179427064,
      "loss": 1.813,
      "mean_token_accuracy": 0.5805202250679334,
      "step": 820
    },
    {
      "epoch": 0.3555872902534809,
      "grad_norm": 0.515625,
      "learning_rate": 0.0004952731375215554,
      "loss": 1.7699,
      "mean_token_accuracy": 0.5885967900355656,
      "step": 830
    },
    {
      "epoch": 0.35987147447340234,
      "grad_norm": 0.52734375,
      "learning_rate": 0.0004950393677012406,
      "loss": 1.7783,
      "mean_token_accuracy": 0.5840276171763737,
      "step": 840
    },
    {
      "epoch": 0.36415565869332384,
      "grad_norm": 0.53125,
      "learning_rate": 0.0004948000138090178,
      "loss": 1.758,
      "mean_token_accuracy": 0.5915057013432184,
      "step": 850
    },
    {
      "epoch": 0.3684398429132453,
      "grad_norm": 0.56640625,
      "learning_rate": 0.000494555081299396,
      "loss": 1.7502,
      "mean_token_accuracy": 0.5921384165684382,
      "step": 860
    },
    {
      "epoch": 0.3727240271331667,
      "grad_norm": 0.55859375,
      "learning_rate": 0.0004943045757540116,
      "loss": 1.7696,
      "mean_token_accuracy": 0.5876839280128479,
      "step": 870
    },
    {
      "epoch": 0.3770082113530882,
      "grad_norm": 0.4765625,
      "learning_rate": 0.0004940485028815028,
      "loss": 1.7923,
      "mean_token_accuracy": 0.5851048608620961,
      "step": 880
    },
    {
      "epoch": 0.38129239557300965,
      "grad_norm": 0.462890625,
      "learning_rate": 0.0004937868685173779,
      "loss": 1.7734,
      "mean_token_accuracy": 0.5879452233513196,
      "step": 890
    },
    {
      "epoch": 0.3855765797929311,
      "grad_norm": 0.51171875,
      "learning_rate": 0.0004935196786238832,
      "loss": 1.7807,
      "mean_token_accuracy": 0.5856667757034302,
      "step": 900
    },
    {
      "epoch": 0.3898607640128526,
      "grad_norm": 0.470703125,
      "learning_rate": 0.0004932469392898675,
      "loss": 1.7573,
      "mean_token_accuracy": 0.589621431628863,
      "step": 910
    },
    {
      "epoch": 0.394144948232774,
      "grad_norm": 0.48046875,
      "learning_rate": 0.0004929686567306424,
      "loss": 1.7764,
      "mean_token_accuracy": 0.5870031158129374,
      "step": 920
    },
    {
      "epoch": 0.39842913245269546,
      "grad_norm": 0.51171875,
      "learning_rate": 0.0004926848372878412,
      "loss": 1.7499,
      "mean_token_accuracy": 0.5911330193281173,
      "step": 930
    },
    {
      "epoch": 0.40271331667261695,
      "grad_norm": 0.51171875,
      "learning_rate": 0.0004923954874292743,
      "loss": 1.7605,
      "mean_token_accuracy": 0.5885690222183864,
      "step": 940
    },
    {
      "epoch": 0.4069975008925384,
      "grad_norm": 0.45703125,
      "learning_rate": 0.0004921006137487819,
      "loss": 1.7249,
      "mean_token_accuracy": 0.5983182614048322,
      "step": 950
    },
    {
      "epoch": 0.4112816851124598,
      "grad_norm": 0.5,
      "learning_rate": 0.0004918002229660836,
      "loss": 1.7411,
      "mean_token_accuracy": 0.5920816282431285,
      "step": 960
    },
    {
      "epoch": 0.41556586933238127,
      "grad_norm": 0.45703125,
      "learning_rate": 0.000491494321926625,
      "loss": 1.7022,
      "mean_token_accuracy": 0.6039901932080587,
      "step": 970
    },
    {
      "epoch": 0.41985005355230276,
      "grad_norm": 0.470703125,
      "learning_rate": 0.0004911829176014227,
      "loss": 1.7546,
      "mean_token_accuracy": 0.5963041971127192,
      "step": 980
    },
    {
      "epoch": 0.4241342377722242,
      "grad_norm": 0.484375,
      "learning_rate": 0.0004908660170869041,
      "loss": 1.7394,
      "mean_token_accuracy": 0.5946206981937091,
      "step": 990
    },
    {
      "epoch": 0.42841842199214564,
      "grad_norm": 0.48828125,
      "learning_rate": 0.0004905436276047468,
      "loss": 1.7065,
      "mean_token_accuracy": 0.5999080528815587,
      "step": 1000
    },
    {
      "epoch": 0.43270260621206713,
      "grad_norm": 0.45703125,
      "learning_rate": 0.0004902157565017131,
      "loss": 1.7317,
      "mean_token_accuracy": 0.5951716423034668,
      "step": 1010
    },
    {
      "epoch": 0.43698679043198857,
      "grad_norm": 0.455078125,
      "learning_rate": 0.0004898824112494834,
      "loss": 1.7353,
      "mean_token_accuracy": 0.5931165516376495,
      "step": 1020
    },
    {
      "epoch": 0.44127097465191,
      "grad_norm": 0.4296875,
      "learning_rate": 0.0004895435994444855,
      "loss": 1.7234,
      "mean_token_accuracy": 0.592176150282224,
      "step": 1030
    },
    {
      "epoch": 0.4455551588718315,
      "grad_norm": 0.478515625,
      "learning_rate": 0.0004891993288077216,
      "loss": 1.7373,
      "mean_token_accuracy": 0.5937283883492152,
      "step": 1040
    },
    {
      "epoch": 0.44983934309175294,
      "grad_norm": 0.48046875,
      "learning_rate": 0.0004888496071845921,
      "loss": 1.7339,
      "mean_token_accuracy": 0.5963114351034164,
      "step": 1050
    },
    {
      "epoch": 0.4541235273116744,
      "grad_norm": 0.515625,
      "learning_rate": 0.0004884944425447174,
      "loss": 1.7075,
      "mean_token_accuracy": 0.5985465884208679,
      "step": 1060
    },
    {
      "epoch": 0.4584077115315959,
      "grad_norm": 0.43359375,
      "learning_rate": 0.00048813384298175533,
      "loss": 1.7108,
      "mean_token_accuracy": 0.5949873934189479,
      "step": 1070
    },
    {
      "epoch": 0.4626918957515173,
      "grad_norm": 0.4296875,
      "learning_rate": 0.000487767816713218,
      "loss": 1.7319,
      "mean_token_accuracy": 0.5978627324104309,
      "step": 1080
    },
    {
      "epoch": 0.46697607997143875,
      "grad_norm": 0.515625,
      "learning_rate": 0.00048739637208028343,
      "loss": 1.7215,
      "mean_token_accuracy": 0.5949975033601125,
      "step": 1090
    },
    {
      "epoch": 0.47126026419136025,
      "grad_norm": 0.41796875,
      "learning_rate": 0.0004870195175476059,
      "loss": 1.6915,
      "mean_token_accuracy": 0.6006935626268387,
      "step": 1100
    },
    {
      "epoch": 0.4755444484112817,
      "grad_norm": 0.45703125,
      "learning_rate": 0.00048663726170312304,
      "loss": 1.7224,
      "mean_token_accuracy": 0.5965896293520927,
      "step": 1110
    },
    {
      "epoch": 0.4798286326312031,
      "grad_norm": 0.46484375,
      "learning_rate": 0.0004862496132578601,
      "loss": 1.7188,
      "mean_token_accuracy": 0.5954587419827779,
      "step": 1120
    },
    {
      "epoch": 0.4841128168511246,
      "grad_norm": 0.46875,
      "learning_rate": 0.0004858565810457315,
      "loss": 1.7029,
      "mean_token_accuracy": 0.5984790166219075,
      "step": 1130
    },
    {
      "epoch": 0.48839700107104606,
      "grad_norm": 0.46875,
      "learning_rate": 0.00048545817402333944,
      "loss": 1.6707,
      "mean_token_accuracy": 0.6039733906586965,
      "step": 1140
    },
    {
      "epoch": 0.4926811852909675,
      "grad_norm": 0.458984375,
      "learning_rate": 0.00048505440126976975,
      "loss": 1.7063,
      "mean_token_accuracy": 0.5990351974964142,
      "step": 1150
    },
    {
      "epoch": 0.496965369510889,
      "grad_norm": 0.443359375,
      "learning_rate": 0.000484645271986385,
      "loss": 1.7019,
      "mean_token_accuracy": 0.5967398678263028,
      "step": 1160
    },
    {
      "epoch": 0.5012495537308104,
      "grad_norm": 0.4453125,
      "learning_rate": 0.00048423079549661513,
      "loss": 1.7185,
      "mean_token_accuracy": 0.5945661654074986,
      "step": 1170
    },
    {
      "epoch": 0.5055337379507319,
      "grad_norm": 0.45703125,
      "learning_rate": 0.00048381098124574453,
      "loss": 1.6953,
      "mean_token_accuracy": 0.5991637865702312,
      "step": 1180
    },
    {
      "epoch": 0.5098179221706534,
      "grad_norm": 0.458984375,
      "learning_rate": 0.000483385838800697,
      "loss": 1.6724,
      "mean_token_accuracy": 0.6055593649546306,
      "step": 1190
    },
    {
      "epoch": 0.5141021063905747,
      "grad_norm": 0.453125,
      "learning_rate": 0.0004829553778498177,
      "loss": 1.69,
      "mean_token_accuracy": 0.6024374475081762,
      "step": 1200
    },
    {
      "epoch": 0.5183862906104962,
      "grad_norm": 0.419921875,
      "learning_rate": 0.0004825196082026525,
      "loss": 1.6896,
      "mean_token_accuracy": 0.6015072325865428,
      "step": 1210
    },
    {
      "epoch": 0.5226704748304177,
      "grad_norm": 0.470703125,
      "learning_rate": 0.00048207853978972425,
      "loss": 1.6911,
      "mean_token_accuracy": 0.5975394507249197,
      "step": 1220
    },
    {
      "epoch": 0.5269546590503391,
      "grad_norm": 0.470703125,
      "learning_rate": 0.00048163218266230657,
      "loss": 1.6506,
      "mean_token_accuracy": 0.6080829570690791,
      "step": 1230
    },
    {
      "epoch": 0.5312388432702606,
      "grad_norm": 0.482421875,
      "learning_rate": 0.00048118054699219486,
      "loss": 1.7084,
      "mean_token_accuracy": 0.5956538558006287,
      "step": 1240
    },
    {
      "epoch": 0.5355230274901821,
      "grad_norm": 0.4296875,
      "learning_rate": 0.00048072364307147434,
      "loss": 1.6769,
      "mean_token_accuracy": 0.6048657476902009,
      "step": 1250
    },
    {
      "epoch": 0.5398072117101035,
      "grad_norm": 0.412109375,
      "learning_rate": 0.00048026148131228544,
      "loss": 1.6391,
      "mean_token_accuracy": 0.6106082250674566,
      "step": 1260
    },
    {
      "epoch": 0.544091395930025,
      "grad_norm": 0.4140625,
      "learning_rate": 0.00047979407224658704,
      "loss": 1.652,
      "mean_token_accuracy": 0.609614963332812,
      "step": 1270
    },
    {
      "epoch": 0.5483755801499465,
      "grad_norm": 0.42578125,
      "learning_rate": 0.0004793214265259158,
      "loss": 1.6686,
      "mean_token_accuracy": 0.6056814392407736,
      "step": 1280
    },
    {
      "epoch": 0.5526597643698679,
      "grad_norm": 0.435546875,
      "learning_rate": 0.0004788435549211439,
      "loss": 1.7061,
      "mean_token_accuracy": 0.600483645995458,
      "step": 1290
    },
    {
      "epoch": 0.5569439485897894,
      "grad_norm": 0.4765625,
      "learning_rate": 0.00047836046832223336,
      "loss": 1.6521,
      "mean_token_accuracy": 0.607241137822469,
      "step": 1300
    },
    {
      "epoch": 0.5612281328097108,
      "grad_norm": 0.443359375,
      "learning_rate": 0.00047787217773798775,
      "loss": 1.6408,
      "mean_token_accuracy": 0.6109602769215902,
      "step": 1310
    },
    {
      "epoch": 0.5655123170296322,
      "grad_norm": 0.416015625,
      "learning_rate": 0.00047737869429580177,
      "loss": 1.6651,
      "mean_token_accuracy": 0.6036148915688196,
      "step": 1320
    },
    {
      "epoch": 0.5697965012495537,
      "grad_norm": 0.39453125,
      "learning_rate": 0.0004768800292414073,
      "loss": 1.6612,
      "mean_token_accuracy": 0.6085895299911499,
      "step": 1330
    },
    {
      "epoch": 0.5740806854694752,
      "grad_norm": 0.419921875,
      "learning_rate": 0.00047637619393861726,
      "loss": 1.6645,
      "mean_token_accuracy": 0.6032974988222122,
      "step": 1340
    },
    {
      "epoch": 0.5783648696893966,
      "grad_norm": 0.439453125,
      "learning_rate": 0.00047586719986906644,
      "loss": 1.6206,
      "mean_token_accuracy": 0.6132790346940359,
      "step": 1350
    },
    {
      "epoch": 0.5826490539093181,
      "grad_norm": 0.423828125,
      "learning_rate": 0.00047535305863195023,
      "loss": 1.6523,
      "mean_token_accuracy": 0.6070671379566193,
      "step": 1360
    },
    {
      "epoch": 0.5869332381292396,
      "grad_norm": 0.404296875,
      "learning_rate": 0.00047483378194376004,
      "loss": 1.6394,
      "mean_token_accuracy": 0.608396037419637,
      "step": 1370
    },
    {
      "epoch": 0.591217422349161,
      "grad_norm": 0.412109375,
      "learning_rate": 0.00047430938163801623,
      "loss": 1.6279,
      "mean_token_accuracy": 0.6083019236723582,
      "step": 1380
    },
    {
      "epoch": 0.5955016065690825,
      "grad_norm": 0.404296875,
      "learning_rate": 0.00047377986966499867,
      "loss": 1.6447,
      "mean_token_accuracy": 0.6057626972595851,
      "step": 1390
    },
    {
      "epoch": 0.599785790789004,
      "grad_norm": 0.41015625,
      "learning_rate": 0.00047324525809147437,
      "loss": 1.6669,
      "mean_token_accuracy": 0.6031167497237523,
      "step": 1400
    },
    {
      "epoch": 0.6040699750089253,
      "grad_norm": 0.3984375,
      "learning_rate": 0.0004727055591004221,
      "loss": 1.6684,
      "mean_token_accuracy": 0.6013229255874951,
      "step": 1410
    },
    {
      "epoch": 0.6083541592288468,
      "grad_norm": 0.453125,
      "learning_rate": 0.00047216078499075556,
      "loss": 1.6047,
      "mean_token_accuracy": 0.613938628633817,
      "step": 1420
    },
    {
      "epoch": 0.6126383434487683,
      "grad_norm": 0.392578125,
      "learning_rate": 0.0004716109481770422,
      "loss": 1.646,
      "mean_token_accuracy": 0.6082295358181,
      "step": 1430
    },
    {
      "epoch": 0.6169225276686897,
      "grad_norm": 0.380859375,
      "learning_rate": 0.000471056061189221,
      "loss": 1.5866,
      "mean_token_accuracy": 0.6149321556091308,
      "step": 1440
    },
    {
      "epoch": 0.6212067118886112,
      "grad_norm": 0.392578125,
      "learning_rate": 0.0004704961366723165,
      "loss": 1.6708,
      "mean_token_accuracy": 0.607517758011818,
      "step": 1450
    },
    {
      "epoch": 0.6254908961085327,
      "grad_norm": 0.40234375,
      "learning_rate": 0.000469931187386151,
      "loss": 1.6741,
      "mean_token_accuracy": 0.607824856042862,
      "step": 1460
    },
    {
      "epoch": 0.6297750803284541,
      "grad_norm": 0.4375,
      "learning_rate": 0.0004693612262050535,
      "loss": 1.6384,
      "mean_token_accuracy": 0.6065444548924764,
      "step": 1470
    },
    {
      "epoch": 0.6340592645483756,
      "grad_norm": 0.3984375,
      "learning_rate": 0.0004687862661175664,
      "loss": 1.6302,
      "mean_token_accuracy": 0.611606694261233,
      "step": 1480
    },
    {
      "epoch": 0.6383434487682971,
      "grad_norm": 0.404296875,
      "learning_rate": 0.0004682063202261495,
      "loss": 1.608,
      "mean_token_accuracy": 0.617574452360471,
      "step": 1490
    },
    {
      "epoch": 0.6426276329882185,
      "grad_norm": 0.392578125,
      "learning_rate": 0.0004676214017468815,
      "loss": 1.6193,
      "mean_token_accuracy": 0.6125218907992045,
      "step": 1500
    },
    {
      "epoch": 0.64691181720814,
      "grad_norm": 0.388671875,
      "learning_rate": 0.00046703152400915873,
      "loss": 1.6094,
      "mean_token_accuracy": 0.6132543057203292,
      "step": 1510
    },
    {
      "epoch": 0.6511960014280614,
      "grad_norm": 0.431640625,
      "learning_rate": 0.0004664367004553914,
      "loss": 1.644,
      "mean_token_accuracy": 0.6055496126413346,
      "step": 1520
    },
    {
      "epoch": 0.6554801856479828,
      "grad_norm": 0.4296875,
      "learning_rate": 0.0004658369446406974,
      "loss": 1.6027,
      "mean_token_accuracy": 0.6137914508581161,
      "step": 1530
    },
    {
      "epoch": 0.6597643698679043,
      "grad_norm": 0.373046875,
      "learning_rate": 0.000465232270232593,
      "loss": 1.5611,
      "mean_token_accuracy": 0.6206978172063827,
      "step": 1540
    },
    {
      "epoch": 0.6640485540878258,
      "grad_norm": 0.404296875,
      "learning_rate": 0.0004646226910106821,
      "loss": 1.5668,
      "mean_token_accuracy": 0.6236391812562943,
      "step": 1550
    },
    {
      "epoch": 0.6683327383077472,
      "grad_norm": 0.423828125,
      "learning_rate": 0.0004640082208663415,
      "loss": 1.6394,
      "mean_token_accuracy": 0.6060982346534729,
      "step": 1560
    },
    {
      "epoch": 0.6726169225276687,
      "grad_norm": 0.3828125,
      "learning_rate": 0.0004633888738024048,
      "loss": 1.5921,
      "mean_token_accuracy": 0.6156363248825073,
      "step": 1570
    },
    {
      "epoch": 0.6769011067475902,
      "grad_norm": 0.400390625,
      "learning_rate": 0.00046276466393284295,
      "loss": 1.6083,
      "mean_token_accuracy": 0.6118316878875096,
      "step": 1580
    },
    {
      "epoch": 0.6811852909675116,
      "grad_norm": 0.392578125,
      "learning_rate": 0.00046213560548244296,
      "loss": 1.5921,
      "mean_token_accuracy": 0.6157896757125855,
      "step": 1590
    },
    {
      "epoch": 0.6854694751874331,
      "grad_norm": 0.416015625,
      "learning_rate": 0.0004615017127864834,
      "loss": 1.612,
      "mean_token_accuracy": 0.6124424457550048,
      "step": 1600
    },
    {
      "epoch": 0.6897536594073546,
      "grad_norm": 0.3828125,
      "learning_rate": 0.00046086300029040805,
      "loss": 1.5575,
      "mean_token_accuracy": 0.620477185646693,
      "step": 1610
    },
    {
      "epoch": 0.6940378436272759,
      "grad_norm": 0.392578125,
      "learning_rate": 0.0004602194825494965,
      "loss": 1.5634,
      "mean_token_accuracy": 0.623723766207695,
      "step": 1620
    },
    {
      "epoch": 0.6983220278471974,
      "grad_norm": 0.419921875,
      "learning_rate": 0.00045957117422853257,
      "loss": 1.5947,
      "mean_token_accuracy": 0.6120685537656149,
      "step": 1630
    },
    {
      "epoch": 0.7026062120671189,
      "grad_norm": 0.365234375,
      "learning_rate": 0.0004589180901014699,
      "loss": 1.5554,
      "mean_token_accuracy": 0.6264029294252396,
      "step": 1640
    },
    {
      "epoch": 0.7068903962870403,
      "grad_norm": 0.37890625,
      "learning_rate": 0.0004582602450510955,
      "loss": 1.6142,
      "mean_token_accuracy": 0.6149922668933868,
      "step": 1650
    },
    {
      "epoch": 0.7111745805069618,
      "grad_norm": 0.375,
      "learning_rate": 0.00045759765406869077,
      "loss": 1.5881,
      "mean_token_accuracy": 0.6184295862913132,
      "step": 1660
    },
    {
      "epoch": 0.7154587647268833,
      "grad_norm": 0.423828125,
      "learning_rate": 0.00045693033225368917,
      "loss": 1.5637,
      "mean_token_accuracy": 0.618203051884969,
      "step": 1670
    },
    {
      "epoch": 0.7197429489468047,
      "grad_norm": 0.40234375,
      "learning_rate": 0.0004562582948133331,
      "loss": 1.5764,
      "mean_token_accuracy": 0.6192536850770315,
      "step": 1680
    },
    {
      "epoch": 0.7240271331667262,
      "grad_norm": 0.376953125,
      "learning_rate": 0.0004555815570623264,
      "loss": 1.6028,
      "mean_token_accuracy": 0.61400941212972,
      "step": 1690
    },
    {
      "epoch": 0.7283113173866477,
      "grad_norm": 0.361328125,
      "learning_rate": 0.000454900134422486,
      "loss": 1.5918,
      "mean_token_accuracy": 0.6173286845286687,
      "step": 1700
    },
    {
      "epoch": 0.732595501606569,
      "grad_norm": 0.380859375,
      "learning_rate": 0.0004542140424223904,
      "loss": 1.5623,
      "mean_token_accuracy": 0.6222762515147527,
      "step": 1710
    },
    {
      "epoch": 0.7368796858264905,
      "grad_norm": 0.40234375,
      "learning_rate": 0.0004535232966970253,
      "loss": 1.5609,
      "mean_token_accuracy": 0.6209416131178538,
      "step": 1720
    },
    {
      "epoch": 0.741163870046412,
      "grad_norm": 0.45703125,
      "learning_rate": 0.0004528279129874281,
      "loss": 1.6066,
      "mean_token_accuracy": 0.6145075420538585,
      "step": 1730
    },
    {
      "epoch": 0.7454480542663334,
      "grad_norm": 0.376953125,
      "learning_rate": 0.00045212790714032843,
      "loss": 1.5767,
      "mean_token_accuracy": 0.6191084752480189,
      "step": 1740
    },
    {
      "epoch": 0.7497322384862549,
      "grad_norm": 0.40234375,
      "learning_rate": 0.0004514232951077875,
      "loss": 1.5814,
      "mean_token_accuracy": 0.612170214454333,
      "step": 1750
    },
    {
      "epoch": 0.7540164227061764,
      "grad_norm": 0.37109375,
      "learning_rate": 0.00045071409294683443,
      "loss": 1.5696,
      "mean_token_accuracy": 0.6165617436170578,
      "step": 1760
    },
    {
      "epoch": 0.7583006069260978,
      "grad_norm": 0.408203125,
      "learning_rate": 0.00045000031681910024,
      "loss": 1.5513,
      "mean_token_accuracy": 0.6239033748706182,
      "step": 1770
    },
    {
      "epoch": 0.7625847911460193,
      "grad_norm": 0.392578125,
      "learning_rate": 0.0004492819829904498,
      "loss": 1.6032,
      "mean_token_accuracy": 0.614315361281236,
      "step": 1780
    },
    {
      "epoch": 0.7668689753659408,
      "grad_norm": 0.3671875,
      "learning_rate": 0.0004485591078306109,
      "loss": 1.5622,
      "mean_token_accuracy": 0.6223886062701544,
      "step": 1790
    },
    {
      "epoch": 0.7711531595858622,
      "grad_norm": 0.392578125,
      "learning_rate": 0.0004478317078128013,
      "loss": 1.5891,
      "mean_token_accuracy": 0.6170624762773513,
      "step": 1800
    },
    {
      "epoch": 0.7754373438057837,
      "grad_norm": 0.359375,
      "learning_rate": 0.0004470997995133534,
      "loss": 1.5589,
      "mean_token_accuracy": 0.6224645217259724,
      "step": 1810
    },
    {
      "epoch": 0.7797215280257052,
      "grad_norm": 0.392578125,
      "learning_rate": 0.0004463633996113365,
      "loss": 1.5597,
      "mean_token_accuracy": 0.6208334664503733,
      "step": 1820
    },
    {
      "epoch": 0.7840057122456265,
      "grad_norm": 0.41015625,
      "learning_rate": 0.00044562252488817644,
      "loss": 1.5748,
      "mean_token_accuracy": 0.6175821512937546,
      "step": 1830
    },
    {
      "epoch": 0.788289896465548,
      "grad_norm": 0.400390625,
      "learning_rate": 0.00044487719222727353,
      "loss": 1.5298,
      "mean_token_accuracy": 0.6264814734458923,
      "step": 1840
    },
    {
      "epoch": 0.7925740806854695,
      "grad_norm": 0.396484375,
      "learning_rate": 0.0004441274186136176,
      "loss": 1.5153,
      "mean_token_accuracy": 0.6300516416629155,
      "step": 1850
    },
    {
      "epoch": 0.7968582649053909,
      "grad_norm": 0.40234375,
      "learning_rate": 0.0004433732211334011,
      "loss": 1.556,
      "mean_token_accuracy": 0.6228625237941742,
      "step": 1860
    },
    {
      "epoch": 0.8011424491253124,
      "grad_norm": 0.375,
      "learning_rate": 0.0004426146169736295,
      "loss": 1.5473,
      "mean_token_accuracy": 0.6202453782161077,
      "step": 1870
    },
    {
      "epoch": 0.8054266333452339,
      "grad_norm": 0.365234375,
      "learning_rate": 0.0004418516234217297,
      "loss": 1.5485,
      "mean_token_accuracy": 0.6223239193360011,
      "step": 1880
    },
    {
      "epoch": 0.8097108175651553,
      "grad_norm": 0.37890625,
      "learning_rate": 0.00044108425786515626,
      "loss": 1.5764,
      "mean_token_accuracy": 0.6173768778642018,
      "step": 1890
    },
    {
      "epoch": 0.8139950017850768,
      "grad_norm": 0.373046875,
      "learning_rate": 0.00044031253779099505,
      "loss": 1.5518,
      "mean_token_accuracy": 0.6269906918207805,
      "step": 1900
    },
    {
      "epoch": 0.8182791860049982,
      "grad_norm": 0.3359375,
      "learning_rate": 0.00043953648078556465,
      "loss": 1.53,
      "mean_token_accuracy": 0.6300149967273077,
      "step": 1910
    },
    {
      "epoch": 0.8225633702249197,
      "grad_norm": 0.361328125,
      "learning_rate": 0.0004387561045340155,
      "loss": 1.5431,
      "mean_token_accuracy": 0.6231356263160706,
      "step": 1920
    },
    {
      "epoch": 0.8268475544448411,
      "grad_norm": 0.337890625,
      "learning_rate": 0.00043797142681992744,
      "loss": 1.5386,
      "mean_token_accuracy": 0.626959690451622,
      "step": 1930
    },
    {
      "epoch": 0.8311317386647625,
      "grad_norm": 0.36328125,
      "learning_rate": 0.0004371824655249037,
      "loss": 1.5707,
      "mean_token_accuracy": 0.6185725013415019,
      "step": 1940
    },
    {
      "epoch": 0.835415922884684,
      "grad_norm": 0.404296875,
      "learning_rate": 0.0004363892386281639,
      "loss": 1.5308,
      "mean_token_accuracy": 0.6247758358716965,
      "step": 1950
    },
    {
      "epoch": 0.8397001071046055,
      "grad_norm": 0.34765625,
      "learning_rate": 0.0004355917642061342,
      "loss": 1.5746,
      "mean_token_accuracy": 0.6213243504365286,
      "step": 1960
    },
    {
      "epoch": 0.8439842913245269,
      "grad_norm": 0.345703125,
      "learning_rate": 0.0004347900604320353,
      "loss": 1.5369,
      "mean_token_accuracy": 0.6267731686433157,
      "step": 1970
    },
    {
      "epoch": 0.8482684755444484,
      "grad_norm": 0.365234375,
      "learning_rate": 0.0004339841455754684,
      "loss": 1.5302,
      "mean_token_accuracy": 0.6294100970029831,
      "step": 1980
    },
    {
      "epoch": 0.8525526597643699,
      "grad_norm": 0.36328125,
      "learning_rate": 0.0004331740380019988,
      "loss": 1.5668,
      "mean_token_accuracy": 0.6167016873757044,
      "step": 1990
    },
    {
      "epoch": 0.8568368439842913,
      "grad_norm": 0.36328125,
      "learning_rate": 0.0004323597561727374,
      "loss": 1.5192,
      "mean_token_accuracy": 0.6278371940056483,
      "step": 2000
    },
    {
      "epoch": 0.8611210282042128,
      "grad_norm": 0.361328125,
      "learning_rate": 0.0004315413186439201,
      "loss": 1.5214,
      "mean_token_accuracy": 0.6299047251542409,
      "step": 2010
    },
    {
      "epoch": 0.8654052124241343,
      "grad_norm": 0.3671875,
      "learning_rate": 0.0004307187440664846,
      "loss": 1.5366,
      "mean_token_accuracy": 0.6220711261034012,
      "step": 2020
    },
    {
      "epoch": 0.8696893966440556,
      "grad_norm": 0.373046875,
      "learning_rate": 0.00042989205118564575,
      "loss": 1.5766,
      "mean_token_accuracy": 0.6220267007748286,
      "step": 2030
    },
    {
      "epoch": 0.8739735808639771,
      "grad_norm": 0.333984375,
      "learning_rate": 0.00042906125884046827,
      "loss": 1.5452,
      "mean_token_accuracy": 0.628257621328036,
      "step": 2040
    },
    {
      "epoch": 0.8782577650838986,
      "grad_norm": 0.3984375,
      "learning_rate": 0.00042822638596343735,
      "loss": 1.5187,
      "mean_token_accuracy": 0.6320155799388886,
      "step": 2050
    },
    {
      "epoch": 0.88254194930382,
      "grad_norm": 0.373046875,
      "learning_rate": 0.0004273874515800271,
      "loss": 1.4877,
      "mean_token_accuracy": 0.6335129290819168,
      "step": 2060
    },
    {
      "epoch": 0.8868261335237415,
      "grad_norm": 0.37109375,
      "learning_rate": 0.0004265444748082674,
      "loss": 1.5414,
      "mean_token_accuracy": 0.6232473621765773,
      "step": 2070
    },
    {
      "epoch": 0.891110317743663,
      "grad_norm": 0.3671875,
      "learning_rate": 0.00042569747485830784,
      "loss": 1.5354,
      "mean_token_accuracy": 0.6265609403451283,
      "step": 2080
    },
    {
      "epoch": 0.8953945019635844,
      "grad_norm": 0.36328125,
      "learning_rate": 0.00042484647103198007,
      "loss": 1.5011,
      "mean_token_accuracy": 0.632336409886678,
      "step": 2090
    },
    {
      "epoch": 0.8996786861835059,
      "grad_norm": 0.3515625,
      "learning_rate": 0.0004239914827223579,
      "loss": 1.5261,
      "mean_token_accuracy": 0.6247933119535446,
      "step": 2100
    },
    {
      "epoch": 0.9039628704034274,
      "grad_norm": 0.333984375,
      "learning_rate": 0.0004231325294133155,
      "loss": 1.5346,
      "mean_token_accuracy": 0.6297541747490565,
      "step": 2110
    },
    {
      "epoch": 0.9082470546233488,
      "grad_norm": 0.34375,
      "learning_rate": 0.0004222696306790833,
      "loss": 1.5457,
      "mean_token_accuracy": 0.6180067032575607,
      "step": 2120
    },
    {
      "epoch": 0.9125312388432703,
      "grad_norm": 0.384765625,
      "learning_rate": 0.000421402806183802,
      "loss": 1.4974,
      "mean_token_accuracy": 0.6331047038237254,
      "step": 2130
    },
    {
      "epoch": 0.9168154230631917,
      "grad_norm": 0.35546875,
      "learning_rate": 0.00042053207568107414,
      "loss": 1.5353,
      "mean_token_accuracy": 0.6293119370937348,
      "step": 2140
    },
    {
      "epoch": 0.9210996072831131,
      "grad_norm": 0.3359375,
      "learning_rate": 0.0004196574590135144,
      "loss": 1.5305,
      "mean_token_accuracy": 0.6253014832735062,
      "step": 2150
    },
    {
      "epoch": 0.9253837915030346,
      "grad_norm": 0.416015625,
      "learning_rate": 0.0004187789761122972,
      "loss": 1.5138,
      "mean_token_accuracy": 0.6260040392478307,
      "step": 2160
    },
    {
      "epoch": 0.9296679757229561,
      "grad_norm": 0.396484375,
      "learning_rate": 0.0004178966469967024,
      "loss": 1.5205,
      "mean_token_accuracy": 0.6270245303710301,
      "step": 2170
    },
    {
      "epoch": 0.9339521599428775,
      "grad_norm": 0.353515625,
      "learning_rate": 0.0004170104917736591,
      "loss": 1.5428,
      "mean_token_accuracy": 0.6261491070191065,
      "step": 2180
    },
    {
      "epoch": 0.938236344162799,
      "grad_norm": 0.35546875,
      "learning_rate": 0.00041612053063728793,
      "loss": 1.4962,
      "mean_token_accuracy": 0.6338411172231039,
      "step": 2190
    },
    {
      "epoch": 0.9425205283827205,
      "grad_norm": 0.3671875,
      "learning_rate": 0.00041522678386844003,
      "loss": 1.5215,
      "mean_token_accuracy": 0.6297219822804133,
      "step": 2200
    },
    {
      "epoch": 0.9468047126026419,
      "grad_norm": 0.341796875,
      "learning_rate": 0.0004143292718342355,
      "loss": 1.5052,
      "mean_token_accuracy": 0.6313861280679702,
      "step": 2210
    },
    {
      "epoch": 0.9510888968225634,
      "grad_norm": 0.373046875,
      "learning_rate": 0.0004134280149875991,
      "loss": 1.5522,
      "mean_token_accuracy": 0.6244584927956264,
      "step": 2220
    },
    {
      "epoch": 0.9553730810424849,
      "grad_norm": 0.34375,
      "learning_rate": 0.000412523033866794,
      "loss": 1.5116,
      "mean_token_accuracy": 0.6311194211244583,
      "step": 2230
    },
    {
      "epoch": 0.9596572652624062,
      "grad_norm": 0.349609375,
      "learning_rate": 0.000411614349094954,
      "loss": 1.5068,
      "mean_token_accuracy": 0.6359375943740209,
      "step": 2240
    },
    {
      "epoch": 0.9639414494823277,
      "grad_norm": 0.34375,
      "learning_rate": 0.00041070198137961334,
      "loss": 1.5228,
      "mean_token_accuracy": 0.630420845746994,
      "step": 2250
    },
    {
      "epoch": 0.9682256337022492,
      "grad_norm": 0.3515625,
      "learning_rate": 0.00040978595151223496,
      "loss": 1.5304,
      "mean_token_accuracy": 0.6266944895188014,
      "step": 2260
    },
    {
      "epoch": 0.9725098179221706,
      "grad_norm": 0.34375,
      "learning_rate": 0.00040886628036773665,
      "loss": 1.4539,
      "mean_token_accuracy": 0.6405547618865967,
      "step": 2270
    },
    {
      "epoch": 0.9767940021420921,
      "grad_norm": 0.38671875,
      "learning_rate": 0.0004079429889040153,
      "loss": 1.4749,
      "mean_token_accuracy": 0.6339232285817464,
      "step": 2280
    },
    {
      "epoch": 0.9810781863620136,
      "grad_norm": 0.36328125,
      "learning_rate": 0.0004070160981614693,
      "loss": 1.4821,
      "mean_token_accuracy": 0.6370660841464997,
      "step": 2290
    },
    {
      "epoch": 0.985362370581935,
      "grad_norm": 0.396484375,
      "learning_rate": 0.00040608562926251914,
      "loss": 1.5096,
      "mean_token_accuracy": 0.6285079121589661,
      "step": 2300
    },
    {
      "epoch": 0.9896465548018565,
      "grad_norm": 0.33984375,
      "learning_rate": 0.000405151603411126,
      "loss": 1.5167,
      "mean_token_accuracy": 0.6253816932439804,
      "step": 2310
    },
    {
      "epoch": 0.993930739021778,
      "grad_norm": 0.33984375,
      "learning_rate": 0.0004042140418923085,
      "loss": 1.487,
      "mean_token_accuracy": 0.6365177949269613,
      "step": 2320
    },
    {
      "epoch": 0.9982149232416994,
      "grad_norm": 0.373046875,
      "learning_rate": 0.0004032729660716579,
      "loss": 1.4693,
      "mean_token_accuracy": 0.6372685492038727,
      "step": 2330
    },
    {
      "epoch": 1.0021420921099606,
      "grad_norm": 0.35546875,
      "learning_rate": 0.00040232839739485067,
      "loss": 1.4216,
      "mean_token_accuracy": 0.6438040440732783,
      "step": 2340
    },
    {
      "epoch": 1.0064262763298821,
      "grad_norm": 0.369140625,
      "learning_rate": 0.0004013803573871605,
      "loss": 1.3392,
      "mean_token_accuracy": 0.6590171555678049,
      "step": 2350
    },
    {
      "epoch": 1.0107104605498036,
      "grad_norm": 0.357421875,
      "learning_rate": 0.00040042886765296714,
      "loss": 1.3092,
      "mean_token_accuracy": 0.6659611910581589,
      "step": 2360
    },
    {
      "epoch": 1.0149946447697251,
      "grad_norm": 0.349609375,
      "learning_rate": 0.0003994739498752645,
      "loss": 1.338,
      "mean_token_accuracy": 0.6604650288820266,
      "step": 2370
    },
    {
      "epoch": 1.0192788289896466,
      "grad_norm": 0.34765625,
      "learning_rate": 0.0003985156258151662,
      "loss": 1.2883,
      "mean_token_accuracy": 0.6699223518371582,
      "step": 2380
    },
    {
      "epoch": 1.023563013209568,
      "grad_norm": 0.3359375,
      "learning_rate": 0.00039755391731140986,
      "loss": 1.3335,
      "mean_token_accuracy": 0.6621675411860148,
      "step": 2390
    },
    {
      "epoch": 1.0278471974294894,
      "grad_norm": 0.37890625,
      "learning_rate": 0.00039658884627985947,
      "loss": 1.3185,
      "mean_token_accuracy": 0.6633899579445521,
      "step": 2400
    },
    {
      "epoch": 1.0321313816494109,
      "grad_norm": 0.330078125,
      "learning_rate": 0.00039562043471300573,
      "loss": 1.3408,
      "mean_token_accuracy": 0.6567703276872635,
      "step": 2410
    },
    {
      "epoch": 1.0364155658693324,
      "grad_norm": 0.333984375,
      "learning_rate": 0.00039464870467946516,
      "loss": 1.3053,
      "mean_token_accuracy": 0.6645698845386505,
      "step": 2420
    },
    {
      "epoch": 1.0406997500892539,
      "grad_norm": 0.3359375,
      "learning_rate": 0.00039367367832347707,
      "loss": 1.3634,
      "mean_token_accuracy": 0.6515278299649556,
      "step": 2430
    },
    {
      "epoch": 1.0449839343091754,
      "grad_norm": 0.341796875,
      "learning_rate": 0.00039269537786439866,
      "loss": 1.3179,
      "mean_token_accuracy": 0.6596842000881831,
      "step": 2440
    },
    {
      "epoch": 1.0492681185290968,
      "grad_norm": 0.341796875,
      "learning_rate": 0.0003917138255961993,
      "loss": 1.3042,
      "mean_token_accuracy": 0.6642681717872619,
      "step": 2450
    },
    {
      "epoch": 1.0535523027490181,
      "grad_norm": 0.373046875,
      "learning_rate": 0.0003907290438869517,
      "loss": 1.328,
      "mean_token_accuracy": 0.6599580854177475,
      "step": 2460
    },
    {
      "epoch": 1.0578364869689396,
      "grad_norm": 0.32421875,
      "learning_rate": 0.00038974105517832315,
      "loss": 1.304,
      "mean_token_accuracy": 0.6646572331587474,
      "step": 2470
    },
    {
      "epoch": 1.062120671188861,
      "grad_norm": 0.326171875,
      "learning_rate": 0.00038874988198506287,
      "loss": 1.3355,
      "mean_token_accuracy": 0.6583087474107743,
      "step": 2480
    },
    {
      "epoch": 1.0664048554087826,
      "grad_norm": 0.337890625,
      "learning_rate": 0.00038775554689449013,
      "loss": 1.3209,
      "mean_token_accuracy": 0.6637792021036149,
      "step": 2490
    },
    {
      "epoch": 1.070689039628704,
      "grad_norm": 0.390625,
      "learning_rate": 0.00038675807256597863,
      "loss": 1.3437,
      "mean_token_accuracy": 0.6541990071535111,
      "step": 2500
    },
    {
      "epoch": 1.0749732238486256,
      "grad_norm": 0.333984375,
      "learning_rate": 0.0003857574817304407,
      "loss": 1.3368,
      "mean_token_accuracy": 0.6589954853057861,
      "step": 2510
    },
    {
      "epoch": 1.0792574080685469,
      "grad_norm": 0.31640625,
      "learning_rate": 0.0003847537971898093,
      "loss": 1.3431,
      "mean_token_accuracy": 0.6575679163138072,
      "step": 2520
    },
    {
      "epoch": 1.0835415922884684,
      "grad_norm": 0.333984375,
      "learning_rate": 0.0003837470418165176,
      "loss": 1.3559,
      "mean_token_accuracy": 0.6553231467803319,
      "step": 2530
    },
    {
      "epoch": 1.0878257765083899,
      "grad_norm": 0.34765625,
      "learning_rate": 0.000382737238552979,
      "loss": 1.3401,
      "mean_token_accuracy": 0.6568175663550695,
      "step": 2540
    },
    {
      "epoch": 1.0921099607283113,
      "grad_norm": 0.3203125,
      "learning_rate": 0.00038172441041106316,
      "loss": 1.3171,
      "mean_token_accuracy": 0.6621425936619441,
      "step": 2550
    },
    {
      "epoch": 1.0963941449482328,
      "grad_norm": 0.349609375,
      "learning_rate": 0.0003807085804715723,
      "loss": 1.3206,
      "mean_token_accuracy": 0.6625715295473734,
      "step": 2560
    },
    {
      "epoch": 1.1006783291681543,
      "grad_norm": 0.31640625,
      "learning_rate": 0.000379689771883715,
      "loss": 1.3234,
      "mean_token_accuracy": 0.660184133052826,
      "step": 2570
    },
    {
      "epoch": 1.1049625133880756,
      "grad_norm": 0.341796875,
      "learning_rate": 0.00037866800786457864,
      "loss": 1.3085,
      "mean_token_accuracy": 0.6675328105688095,
      "step": 2580
    },
    {
      "epoch": 1.109246697607997,
      "grad_norm": 0.34375,
      "learning_rate": 0.00037764331169860046,
      "loss": 1.3138,
      "mean_token_accuracy": 0.6605868438879648,
      "step": 2590
    },
    {
      "epoch": 1.1135308818279186,
      "grad_norm": 0.34765625,
      "learning_rate": 0.0003766157067370366,
      "loss": 1.3211,
      "mean_token_accuracy": 0.6628186653057734,
      "step": 2600
    },
    {
      "epoch": 1.11781506604784,
      "grad_norm": 0.322265625,
      "learning_rate": 0.00037558521639743036,
      "loss": 1.3113,
      "mean_token_accuracy": 0.6645319203535716,
      "step": 2610
    },
    {
      "epoch": 1.1220992502677616,
      "grad_norm": 0.326171875,
      "learning_rate": 0.0003745518641630785,
      "loss": 1.3303,
      "mean_token_accuracy": 0.659632471203804,
      "step": 2620
    },
    {
      "epoch": 1.1263834344876829,
      "grad_norm": 0.365234375,
      "learning_rate": 0.0003735156735824957,
      "loss": 1.3333,
      "mean_token_accuracy": 0.6590901345014573,
      "step": 2630
    },
    {
      "epoch": 1.1306676187076043,
      "grad_norm": 0.35546875,
      "learning_rate": 0.0003724766682688784,
      "loss": 1.3414,
      "mean_token_accuracy": 0.6546030879020691,
      "step": 2640
    },
    {
      "epoch": 1.1349518029275258,
      "grad_norm": 0.37109375,
      "learning_rate": 0.00037143487189956635,
      "loss": 1.3257,
      "mean_token_accuracy": 0.6572333713372548,
      "step": 2650
    },
    {
      "epoch": 1.1392359871474473,
      "grad_norm": 0.37109375,
      "learning_rate": 0.0003703903082155035,
      "loss": 1.3372,
      "mean_token_accuracy": 0.6556382944186528,
      "step": 2660
    },
    {
      "epoch": 1.1435201713673688,
      "grad_norm": 0.318359375,
      "learning_rate": 0.0003693430010206962,
      "loss": 1.321,
      "mean_token_accuracy": 0.6620635330677033,
      "step": 2670
    },
    {
      "epoch": 1.1478043555872903,
      "grad_norm": 0.318359375,
      "learning_rate": 0.0003682929741816717,
      "loss": 1.3186,
      "mean_token_accuracy": 0.6653014043966929,
      "step": 2680
    },
    {
      "epoch": 1.1520885398072118,
      "grad_norm": 0.404296875,
      "learning_rate": 0.00036724025162693317,
      "loss": 1.3106,
      "mean_token_accuracy": 0.666445321838061,
      "step": 2690
    },
    {
      "epoch": 1.156372724027133,
      "grad_norm": 0.345703125,
      "learning_rate": 0.00036618485734641584,
      "loss": 1.3426,
      "mean_token_accuracy": 0.6579491098721822,
      "step": 2700
    },
    {
      "epoch": 1.1606569082470546,
      "grad_norm": 0.36328125,
      "learning_rate": 0.0003651268153909386,
      "loss": 1.3593,
      "mean_token_accuracy": 0.6558250943819682,
      "step": 2710
    },
    {
      "epoch": 1.164941092466976,
      "grad_norm": 0.380859375,
      "learning_rate": 0.00036406614987165737,
      "loss": 1.3385,
      "mean_token_accuracy": 0.6573931157588959,
      "step": 2720
    },
    {
      "epoch": 1.1692252766868976,
      "grad_norm": 0.35546875,
      "learning_rate": 0.00036300288495951487,
      "loss": 1.303,
      "mean_token_accuracy": 0.6656403839588165,
      "step": 2730
    },
    {
      "epoch": 1.173509460906819,
      "grad_norm": 0.306640625,
      "learning_rate": 0.0003619370448846901,
      "loss": 1.321,
      "mean_token_accuracy": 0.6617445478836695,
      "step": 2740
    },
    {
      "epoch": 1.1777936451267403,
      "grad_norm": 0.380859375,
      "learning_rate": 0.000360868653936046,
      "loss": 1.3403,
      "mean_token_accuracy": 0.6576227575540543,
      "step": 2750
    },
    {
      "epoch": 1.1820778293466618,
      "grad_norm": 0.34765625,
      "learning_rate": 0.00035979773646057603,
| "loss": 1.3161, | |
| "mean_token_accuracy": 0.6614106744527817, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 1.1863620135665833, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 0.0003587243168628491, | |
| "loss": 1.3549, | |
| "mean_token_accuracy": 0.6531448741753896, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 1.1906461977865048, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 0.00035764841960445433, | |
| "loss": 1.3016, | |
| "mean_token_accuracy": 0.6694233824809392, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 1.1949303820064263, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 0.0003565700692034421, | |
| "loss": 1.3076, | |
| "mean_token_accuracy": 0.6626476556062698, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 1.1992145662263478, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 0.00035548929023376677, | |
| "loss": 1.2832, | |
| "mean_token_accuracy": 0.6730219264825185, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.2034987504462693, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 0.00035440610732472564, | |
| "loss": 1.3105, | |
| "mean_token_accuracy": 0.6644942373037338, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 1.2077829346661906, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 0.00035332054516039834, | |
| "loss": 1.3084, | |
| "mean_token_accuracy": 0.6659467816352844, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 1.212067118886112, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 0.0003522326284790842, | |
| "loss": 1.3263, | |
| "mean_token_accuracy": 0.6606633335351944, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 1.2163513031060336, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 0.000351142382072738, | |
| "loss": 1.3269, | |
| "mean_token_accuracy": 0.6555161933104198, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 1.220635487325955, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 0.0003500498307864057, | |
| "loss": 1.3094, | |
| "mean_token_accuracy": 0.6611082951227824, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 1.2249196715458766, | |
| "grad_norm": 0.337890625, | |
| "learning_rate": 0.00034895499951765805, | |
| "loss": 1.3298, | |
| "mean_token_accuracy": 0.6599132279555003, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 1.2292038557657978, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 0.0003478579132160226, | |
| "loss": 1.3194, | |
| "mean_token_accuracy": 0.6632148762543996, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 1.2334880399857193, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 0.00034675859688241607, | |
| "loss": 1.3216, | |
| "mean_token_accuracy": 0.6630672425031662, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 1.2377722242056408, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 0.00034565707556857405, | |
| "loss": 1.3175, | |
| "mean_token_accuracy": 0.6651228954394658, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 1.2420564084255623, | |
| "grad_norm": 0.357421875, | |
| "learning_rate": 0.0003445533743764804, | |
| "loss": 1.2983, | |
| "mean_token_accuracy": 0.6665589342514674, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.2463405926454838, | |
| "grad_norm": 0.408203125, | |
| "learning_rate": 0.00034344751845779485, | |
| "loss": 1.332, | |
| "mean_token_accuracy": 0.6578145096699397, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 1.250624776865405, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 0.00034233953301328026, | |
| "loss": 1.3221, | |
| "mean_token_accuracy": 0.6605254203081131, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 1.2549089610853268, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 0.0003412294432922278, | |
| "loss": 1.3093, | |
| "mean_token_accuracy": 0.665352334578832, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 1.259193145305248, | |
| "grad_norm": 0.349609375, | |
| "learning_rate": 0.00034011727459188224, | |
| "loss": 1.2938, | |
| "mean_token_accuracy": 0.6671251505613327, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 1.2634773295251696, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 0.000339003052256865, | |
| "loss": 1.283, | |
| "mean_token_accuracy": 0.666038570801417, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 1.267761513745091, | |
| "grad_norm": 0.3828125, | |
| "learning_rate": 0.0003378868016785966, | |
| "loss": 1.3174, | |
| "mean_token_accuracy": 0.6619399875402451, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 1.2720456979650125, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 0.000336768548294718, | |
| "loss": 1.3422, | |
| "mean_token_accuracy": 0.6565162986516953, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 1.276329882184934, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 0.00033564831758851145, | |
| "loss": 1.2966, | |
| "mean_token_accuracy": 0.6664889295895894, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 1.2806140664048553, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 0.0003345261350883189, | |
| "loss": 1.3002, | |
| "mean_token_accuracy": 0.6681903342405955, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 1.2848982506247768, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 0.00033340202636696103, | |
| "loss": 1.3375, | |
| "mean_token_accuracy": 0.6571354617675146, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.2891824348446983, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 0.0003322760170411539, | |
| "loss": 1.3163, | |
| "mean_token_accuracy": 0.6622751196225484, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 1.2934666190646198, | |
| "grad_norm": 0.337890625, | |
| "learning_rate": 0.00033114813277092557, | |
| "loss": 1.3095, | |
| "mean_token_accuracy": 0.6604131867488225, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 1.2977508032845413, | |
| "grad_norm": 0.337890625, | |
| "learning_rate": 0.00033001839925903123, | |
| "loss": 1.269, | |
| "mean_token_accuracy": 0.670791240533193, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 1.3020349875044626, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 0.00032888684225036735, | |
| "loss": 1.324, | |
| "mean_token_accuracy": 0.6640015542507172, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 1.3063191717243843, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 0.0003277534875313851, | |
| "loss": 1.2934, | |
| "mean_token_accuracy": 0.6645639598369598, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.3106033559443055, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 0.000326618360929503, | |
| "loss": 1.2911, | |
| "mean_token_accuracy": 0.662736972173055, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 1.314887540164227, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 0.0003254814883125176, | |
| "loss": 1.3102, | |
| "mean_token_accuracy": 0.6641101191441218, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 1.3191717243841485, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 0.00032434289558801486, | |
| "loss": 1.2946, | |
| "mean_token_accuracy": 0.669797545671463, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 1.32345590860407, | |
| "grad_norm": 0.34765625, | |
| "learning_rate": 0.00032320260870277907, | |
| "loss": 1.3234, | |
| "mean_token_accuracy": 0.662629238764445, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 1.3277400928239915, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 0.00032206065364220204, | |
| "loss": 1.2875, | |
| "mean_token_accuracy": 0.6695479412873586, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.3320242770439128, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 0.0003209170564296907, | |
| "loss": 1.3185, | |
| "mean_token_accuracy": 0.6618772675593694, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 1.3363084612638343, | |
| "grad_norm": 0.337890625, | |
| "learning_rate": 0.00031977184312607406, | |
| "loss": 1.2956, | |
| "mean_token_accuracy": 0.6677195648352305, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 1.3405926454837558, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 0.0003186250398290093, | |
| "loss": 1.31, | |
| "mean_token_accuracy": 0.662284501393636, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 1.3448768297036773, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 0.0003174766726723873, | |
| "loss": 1.3053, | |
| "mean_token_accuracy": 0.6651754269997279, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 1.3491610139235988, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 0.000316326767825737, | |
| "loss": 1.3318, | |
| "mean_token_accuracy": 0.658694452047348, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.35344519814352, | |
| "grad_norm": 0.3046875, | |
| "learning_rate": 0.0003151753514936285, | |
| "loss": 1.2949, | |
| "mean_token_accuracy": 0.6696987201770147, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 1.3577293823634418, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 0.00031402244991507656, | |
| "loss": 1.3184, | |
| "mean_token_accuracy": 0.6619691550731659, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 1.362013566583363, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 0.00031286808936294266, | |
| "loss": 1.2695, | |
| "mean_token_accuracy": 0.6740480273962021, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 1.3662977508032845, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 0.00031171229614333567, | |
| "loss": 1.2973, | |
| "mean_token_accuracy": 0.6682296107212703, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 1.370581935023206, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 0.0003105550965950132, | |
| "loss": 1.2879, | |
| "mean_token_accuracy": 0.6695340464512507, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.3748661192431275, | |
| "grad_norm": 0.33984375, | |
| "learning_rate": 0.0003093965170887804, | |
| "loss": 1.3382, | |
| "mean_token_accuracy": 0.6577111542224884, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 1.379150303463049, | |
| "grad_norm": 0.3359375, | |
| "learning_rate": 0.00030823658402689004, | |
| "loss": 1.3005, | |
| "mean_token_accuracy": 0.6613259871800741, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 1.3834344876829703, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 0.0003070753238424401, | |
| "loss": 1.2846, | |
| "mean_token_accuracy": 0.6712899088859559, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 1.3877186719028918, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 0.0003059127629987715, | |
| "loss": 1.3022, | |
| "mean_token_accuracy": 0.6612801601489385, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 1.3920028561228133, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 0.00030474892798886574, | |
| "loss": 1.2673, | |
| "mean_token_accuracy": 0.6732950339714686, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.3962870403427348, | |
| "grad_norm": 0.33984375, | |
| "learning_rate": 0.00030358384533473993, | |
| "loss": 1.2745, | |
| "mean_token_accuracy": 0.6716413120428721, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 1.4005712245626563, | |
| "grad_norm": 0.36328125, | |
| "learning_rate": 0.0003024175415868436, | |
| "loss": 1.2964, | |
| "mean_token_accuracy": 0.6657003333171209, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 1.4048554087825775, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 0.00030125004332345293, | |
| "loss": 1.3154, | |
| "mean_token_accuracy": 0.6634772340456645, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 1.409139593002499, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 0.0003000813771500652, | |
| "loss": 1.2857, | |
| "mean_token_accuracy": 0.6668003877003987, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 1.4134237772224205, | |
| "grad_norm": 0.33984375, | |
| "learning_rate": 0.00029891156969879276, | |
| "loss": 1.2846, | |
| "mean_token_accuracy": 0.6660936713218689, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.417707961442342, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 0.00029774064762775584, | |
| "loss": 1.3119, | |
| "mean_token_accuracy": 0.6623692701260249, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 1.4219921456622635, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 0.00029656863762047507, | |
| "loss": 1.2528, | |
| "mean_token_accuracy": 0.6746878981590271, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 1.426276329882185, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 0.0002953955663852637, | |
| "loss": 1.2755, | |
| "mean_token_accuracy": 0.6691719969113668, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 1.4305605141021065, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 0.00029422146065461846, | |
| "loss": 1.291, | |
| "mean_token_accuracy": 0.6642946968475978, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 1.4348446983220278, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 0.0002930463471846109, | |
| "loss": 1.2858, | |
| "mean_token_accuracy": 0.6680060108502706, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.4391288825419493, | |
| "grad_norm": 0.3515625, | |
| "learning_rate": 0.00029187025275427726, | |
| "loss": 1.3238, | |
| "mean_token_accuracy": 0.6617356767257054, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 1.4434130667618708, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 0.0002906932041650083, | |
| "loss": 1.2809, | |
| "mean_token_accuracy": 0.6699890126784642, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 1.4476972509817922, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 0.00028951522823993884, | |
| "loss": 1.2368, | |
| "mean_token_accuracy": 0.6795000006755193, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 1.4519814352017137, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 0.0002883363518233361, | |
| "loss": 1.2939, | |
| "mean_token_accuracy": 0.6654811183611552, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 1.456265619421635, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 0.0002871566017799881, | |
| "loss": 1.2918, | |
| "mean_token_accuracy": 0.6694387882947922, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.4605498036415565, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 0.0002859760049945915, | |
| "loss": 1.2996, | |
| "mean_token_accuracy": 0.6622726023197174, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 1.464833987861478, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 0.00028479458837113886, | |
| "loss": 1.2907, | |
| "mean_token_accuracy": 0.6693145235379537, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 1.4691181720813995, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 0.00028361237883230595, | |
| "loss": 1.2589, | |
| "mean_token_accuracy": 0.6747862150271734, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 1.473402356301321, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 0.00028242940331883726, | |
| "loss": 1.301, | |
| "mean_token_accuracy": 0.665789802869161, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 1.4776865405212425, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 0.00028124568878893323, | |
| "loss": 1.2522, | |
| "mean_token_accuracy": 0.6749315430720647, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.481970724741164, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 0.000280061262217635, | |
| "loss": 1.2831, | |
| "mean_token_accuracy": 0.6701455026865005, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 1.4862549089610853, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 0.0002788761505962102, | |
| "loss": 1.2951, | |
| "mean_token_accuracy": 0.6682408134142558, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 1.4905390931810067, | |
| "grad_norm": 0.337890625, | |
| "learning_rate": 0.00027769038093153765, | |
| "loss": 1.2898, | |
| "mean_token_accuracy": 0.6709642231464386, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 1.4948232774009282, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 0.000276503980245492, | |
| "loss": 1.2709, | |
| "mean_token_accuracy": 0.6721013089021047, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 1.4991074616208497, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 0.0002753169755743277, | |
| "loss": 1.2677, | |
| "mean_token_accuracy": 0.6691457152366638, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.5033916458407712, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 0.0002741293939680637, | |
| "loss": 1.2841, | |
| "mean_token_accuracy": 0.6697168608506521, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 1.5076758300606925, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 0.00027294126248986563, | |
| "loss": 1.2824, | |
| "mean_token_accuracy": 0.6717678278684616, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 1.5119600142806142, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 0.0002717526082154304, | |
| "loss": 1.2587, | |
| "mean_token_accuracy": 0.6737531036138534, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 1.5162441985005355, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 0.00027056345823236837, | |
| "loss": 1.265, | |
| "mean_token_accuracy": 0.6732283522685368, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 1.520528382720457, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 0.0002693738396395866, | |
| "loss": 1.2674, | |
| "mean_token_accuracy": 0.671101305882136, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.5248125669403785, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 0.00026818377954667083, | |
| "loss": 1.294, | |
| "mean_token_accuracy": 0.6672694971164067, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 1.5290967511602997, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 0.0002669933050732679, | |
| "loss": 1.2372, | |
| "mean_token_accuracy": 0.6788262248039245, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 1.5333809353802215, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 0.000265802443348468, | |
| "loss": 1.2665, | |
| "mean_token_accuracy": 0.6717091699441274, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 1.5376651196001427, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 0.0002646112215101858, | |
| "loss": 1.2526, | |
| "mean_token_accuracy": 0.6772685488065083, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 1.5419493038200642, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 0.0002634196667045428, | |
| "loss": 1.2361, | |
| "mean_token_accuracy": 0.6766181766986847, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.5462334880399857, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 0.0002622278060852481, | |
| "loss": 1.2519, | |
| "mean_token_accuracy": 0.6780080666144689, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 1.5505176722599072, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 0.00026103566681297973, | |
| "loss": 1.2568, | |
| "mean_token_accuracy": 0.6755953232447306, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 1.5548018564798287, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 0.00025984327605476607, | |
| "loss": 1.2607, | |
| "mean_token_accuracy": 0.6782088299592336, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 1.55908604069975, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 0.0002586506609833662, | |
| "loss": 1.2753, | |
| "mean_token_accuracy": 0.6692374924818675, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 1.5633702249196717, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 0.00025745784877665123, | |
| "loss": 1.2738, | |
| "mean_token_accuracy": 0.6710573007663091, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.567654409139593, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 0.00025626486661698447, | |
| "loss": 1.2333, | |
| "mean_token_accuracy": 0.6815130422512691, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 1.5719385933595145, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 0.0002550717416906022, | |
| "loss": 1.2424, | |
| "mean_token_accuracy": 0.6788204977909724, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 1.576222777579436, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 0.00025387850118699433, | |
| "loss": 1.2701, | |
| "mean_token_accuracy": 0.6694417973359426, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 1.5805069617993572, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 0.00025268517229828436, | |
| "loss": 1.2538, | |
| "mean_token_accuracy": 0.6743627349535625, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 1.584791146019279, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 0.00025149178221861015, | |
| "loss": 1.2538, | |
| "mean_token_accuracy": 0.6739106665054957, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.5890753302392002, | |
| "grad_norm": 0.3046875, | |
| "learning_rate": 0.0002502983581435038, | |
| "loss": 1.2443, | |
| "mean_token_accuracy": 0.6750338355700175, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 1.5933595144591217, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 0.00024910492726927237, | |
| "loss": 1.2376, | |
| "mean_token_accuracy": 0.6762576440970103, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 1.5976436986790432, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 0.0002479115167923776, | |
| "loss": 1.2506, | |
| "mean_token_accuracy": 0.6760666708151499, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 1.6019278828989647, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 0.0002467181539088166, | |
| "loss": 1.2515, | |
| "mean_token_accuracy": 0.6759268889824549, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 1.6062120671188862, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 0.0002455248658135018, | |
| "loss": 1.2563, | |
| "mean_token_accuracy": 0.6760497013727824, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.6104962513388075, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 0.0002443316796996414, | |
| "loss": 1.2115, | |
| "mean_token_accuracy": 0.6829163481791815, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 1.6147804355587292, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 0.00024313862275811954, | |
| "loss": 1.2524, | |
| "mean_token_accuracy": 0.6750467717647552, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 1.6190646197786505, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 0.00024194572217687657, | |
| "loss": 1.2558, | |
| "mean_token_accuracy": 0.6752542575200399, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 1.623348803998572, | |
| "grad_norm": 0.3359375, | |
| "learning_rate": 0.00024075300514028996, | |
| "loss": 1.2014, | |
| "mean_token_accuracy": 0.6858539601167043, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 1.6276329882184934, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 0.00023956049882855435, | |
| "loss": 1.2438, | |
| "mean_token_accuracy": 0.6779787172873815, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.6319171724384147, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 0.00023836823041706214, | |
| "loss": 1.2752, | |
| "mean_token_accuracy": 0.6702134589354197, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 1.6362013566583364, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 0.00023717622707578444, | |
| "loss": 1.2694, | |
| "mean_token_accuracy": 0.6743356436491013, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 1.6404855408782577, | |
| "grad_norm": 0.3046875, | |
| "learning_rate": 0.00023598451596865185, | |
| "loss": 1.2574, | |
| "mean_token_accuracy": 0.6762406408786774, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 1.6447697250981792, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 0.00023479312425293532, | |
| "loss": 1.2388, | |
| "mean_token_accuracy": 0.6752854565779368, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 1.6490539093181007, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 0.00023360207907862753, | |
| "loss": 1.2227, | |
| "mean_token_accuracy": 0.6849334806203842, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.653338093538022, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 0.00023241140758782387, | |
| "loss": 1.2652, | |
| "mean_token_accuracy": 0.674446169535319, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 1.6576222777579437, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 0.00023122113691410396, | |
| "loss": 1.2314, | |
| "mean_token_accuracy": 0.6808192272981007, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 1.661906461977865, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 0.00023003129418191356, | |
| "loss": 1.2636, | |
| "mean_token_accuracy": 0.6753036012252172, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 1.6661906461977865, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 0.00022884190650594648, | |
| "loss": 1.2169, | |
| "mean_token_accuracy": 0.6835887253284454, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 1.670474830417708, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 0.00022765300099052607, | |
| "loss": 1.2232, | |
| "mean_token_accuracy": 0.679744150241216, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.6747590146376294, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 0.00022646460472898824, | |
| "loss": 1.2182, | |
| "mean_token_accuracy": 0.6807852476835251, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 1.679043198857551, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 0.00022527674480306382, | |
| "loss": 1.2026, | |
| "mean_token_accuracy": 0.6893678406874338, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 1.6833273830774722, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 0.00022408944828226113, | |
| "loss": 1.234, | |
| "mean_token_accuracy": 0.6771794279416402, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 1.687611567297394, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 0.00022290274222324971, | |
| "loss": 1.235, | |
| "mean_token_accuracy": 0.6769244899352391, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 1.6918957515173152, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 0.00022171665366924303, | |
| "loss": 1.2525, | |
| "mean_token_accuracy": 0.6755237986644109, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.6961799357372367, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 0.0002205312096493829, | |
| "loss": 1.2201, | |
| "mean_token_accuracy": 0.6836084206899007, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 1.7004641199571582, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 0.00021934643717812281, | |
| "loss": 1.2055, | |
| "mean_token_accuracy": 0.685323445002238, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 1.7047483041770795, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 0.0002181623632546129, | |
| "loss": 1.2321, | |
| "mean_token_accuracy": 0.6818149715662003, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 1.7090324883970012, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 0.00021697901486208458, | |
| "loss": 1.2101, | |
| "mean_token_accuracy": 0.6855930884679159, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 1.7133166726169224, | |
| "grad_norm": 0.3046875, | |
| "learning_rate": 0.0002157964189672353, | |
| "loss": 1.2379, | |
| "mean_token_accuracy": 0.6786120980978012, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.717600856836844, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 0.0002146146025196144, | |
| "loss": 1.2587, | |
| "mean_token_accuracy": 0.676125301917394, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 1.7218850410567654, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 0.00021343359245100873, | |
| "loss": 1.2412, | |
| "mean_token_accuracy": 0.6767153064409892, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 1.726169225276687, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 0.0002122534156748289, | |
| "loss": 1.2144, | |
| "mean_token_accuracy": 0.68453202744325, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 1.7304534094966084, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 0.00021107409908549632, | |
| "loss": 1.2382, | |
| "mean_token_accuracy": 0.6767129331827164, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 1.7347375937165297, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 0.00020989566955782992, | |
| "loss": 1.2235, | |
| "mean_token_accuracy": 0.6808336655298869, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.7390217779364514, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 0.00020871815394643385, | |
| "loss": 1.2331, | |
| "mean_token_accuracy": 0.6781981885433197, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 1.7433059621563727, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 0.00020754157908508536, | |
| "loss": 1.2358, | |
| "mean_token_accuracy": 0.6778120845556259, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 1.7475901463762942, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 0.00020636597178612365, | |
| "loss": 1.2209, | |
| "mean_token_accuracy": 0.6791574577490489, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 1.7518743305962157, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 0.00020519135883983878, | |
| "loss": 1.2202, | |
| "mean_token_accuracy": 0.6824594676494599, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 1.756158514816137, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 0.0002040177670138607, | |
| "loss": 1.2279, | |
| "mean_token_accuracy": 0.6825836052497228, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.7604426990360587, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 0.0002028452230525497, | |
| "loss": 1.1914, | |
| "mean_token_accuracy": 0.6880497256914775, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 1.76472688325598, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 0.00020167375367638707, | |
| "loss": 1.2336, | |
| "mean_token_accuracy": 0.6778925875822703, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 1.7690110674759014, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 0.0002005033855813655, | |
| "loss": 1.248, | |
| "mean_token_accuracy": 0.6761407842238744, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 1.773295251695823, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 0.0001993341454383817, | |
| "loss": 1.2239, | |
| "mean_token_accuracy": 0.6766098787387212, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 1.7775794359157444, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 0.0001981660598926277, | |
| "loss": 1.2034, | |
| "mean_token_accuracy": 0.6846154699722926, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.781863620135666, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 0.00019699915556298413, | |
| "loss": 1.1972, | |
| "mean_token_accuracy": 0.6866144865751267, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 1.7861478043555872, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 0.0001958334590414136, | |
| "loss": 1.2012, | |
| "mean_token_accuracy": 0.6865098079045614, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 1.790431988575509, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 0.00019466899689235434, | |
| "loss": 1.2154, | |
| "mean_token_accuracy": 0.6773028880357742, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 1.7947161727954302, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 0.00019350579565211563, | |
| "loss": 1.2082, | |
| "mean_token_accuracy": 0.6842084477345148, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 1.7990003570153517, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 0.0001923438818282721, | |
| "loss": 1.2321, | |
| "mean_token_accuracy": 0.680533907810847, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.8032845412352732, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 0.00019118328189906037, | |
| "loss": 1.2276, | |
| "mean_token_accuracy": 0.6793491671482722, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 1.8075687254551944, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 0.00019002402231277533, | |
| "loss": 1.1895, | |
| "mean_token_accuracy": 0.6883792887131374, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 1.8118529096751161, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 0.00018886612948716737, | |
| "loss": 1.196, | |
| "mean_token_accuracy": 0.6824484934409459, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 1.8161370938950374, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 0.00018770962980884086, | |
| "loss": 1.2068, | |
| "mean_token_accuracy": 0.6837878266970316, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 1.820421278114959, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 0.0001865545496326523, | |
| "loss": 1.2161, | |
| "mean_token_accuracy": 0.6800723781188329, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.8247054623348804, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 0.00018540091528110973, | |
| "loss": 1.1971, | |
| "mean_token_accuracy": 0.6874994953473409, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 1.828989646554802, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 0.0001842487530437732, | |
| "loss": 1.1987, | |
| "mean_token_accuracy": 0.6840162913004557, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 1.8332738307747234, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 0.00018309808917665562, | |
| "loss": 1.1892, | |
| "mean_token_accuracy": 0.6898854543765386, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 1.8375580149946447, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 0.00018194894990162424, | |
| "loss": 1.1957, | |
| "mean_token_accuracy": 0.6860898315906525, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 1.8418421992145664, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 0.00018080136140580328, | |
| "loss": 1.2323, | |
| "mean_token_accuracy": 0.6818597843249639, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.8461263834344876, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 0.00017965534984097696, | |
| "loss": 1.2124, | |
| "mean_token_accuracy": 0.6838676472504933, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 1.8504105676544091, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 0.00017851094132299362, | |
| "loss": 1.1997, | |
| "mean_token_accuracy": 0.6875764499107997, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 1.8546947518743306, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 0.00017736816193117066, | |
| "loss": 1.1956, | |
| "mean_token_accuracy": 0.6873005121946335, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 1.858978936094252, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 0.0001762270377077005, | |
| "loss": 1.1768, | |
| "mean_token_accuracy": 0.6917450726032257, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 1.8632631203141736, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 0.0001750875946570564, | |
| "loss": 1.2165, | |
| "mean_token_accuracy": 0.6850099762280782, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.867547304534095, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 0.00017394985874540032, | |
| "loss": 1.1952, | |
| "mean_token_accuracy": 0.6887429515520732, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 1.8718314887540164, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 0.00017281385589999133, | |
| "loss": 1.1916, | |
| "mean_token_accuracy": 0.6882669021685918, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 1.8761156729739379, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 0.00017167961200859432, | |
| "loss": 1.2191, | |
| "mean_token_accuracy": 0.6836802691221238, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 1.8803998571938594, | |
| "grad_norm": 0.2890625, | |
| "learning_rate": 0.00017054715291889072, | |
| "loss": 1.1771, | |
| "mean_token_accuracy": 0.689866092801094, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 1.8846840414137809, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 0.00016941650443788857, | |
| "loss": 1.1708, | |
| "mean_token_accuracy": 0.6898455142974853, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.8889682256337021, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 0.00016828769233133528, | |
| "loss": 1.2134, | |
| "mean_token_accuracy": 0.6872386793295543, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 1.8932524098536239, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 0.00016716074232312993, | |
| "loss": 1.2143, | |
| "mean_token_accuracy": 0.6822999050219853, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 1.8975365940735451, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 0.00016603568009473715, | |
| "loss": 1.1919, | |
| "mean_token_accuracy": 0.6847109029690425, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 1.9018207782934666, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 0.00016491253128460222, | |
| "loss": 1.1944, | |
| "mean_token_accuracy": 0.6889555603265762, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 1.9061049625133881, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 0.00016379132148756638, | |
| "loss": 1.2015, | |
| "mean_token_accuracy": 0.6848962704340616, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.9103891467333094, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 0.00016267207625428375, | |
| "loss": 1.2066, | |
| "mean_token_accuracy": 0.6844677517811457, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 1.9146733309532311, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 0.00016155482109063898, | |
| "loss": 1.1895, | |
| "mean_token_accuracy": 0.689970392982165, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 1.9189575151731524, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 0.00016043958145716615, | |
| "loss": 1.1808, | |
| "mean_token_accuracy": 0.6905927946170171, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 1.9232416993930739, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 0.00015932638276846853, | |
| "loss": 1.1895, | |
| "mean_token_accuracy": 0.6876069366931915, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 1.9275258836129954, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 0.00015821525039263945, | |
| "loss": 1.1847, | |
| "mean_token_accuracy": 0.6875915179649988, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.9318100678329169, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 0.00015710620965068395, | |
| "loss": 1.2023, | |
| "mean_token_accuracy": 0.6880812575419744, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 1.9360942520528384, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 0.00015599928581594197, | |
| "loss": 1.1944, | |
| "mean_token_accuracy": 0.6877446641524633, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 1.9403784362727596, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 0.00015489450411351247, | |
| "loss": 1.2015, | |
| "mean_token_accuracy": 0.6858119696378708, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 1.9446626204926813, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 0.00015379188971967854, | |
| "loss": 1.1747, | |
| "mean_token_accuracy": 0.6891631742318471, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 1.9489468047126026, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 0.00015269146776133346, | |
| "loss": 1.1804, | |
| "mean_token_accuracy": 0.6899756520986557, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.9532309889325241, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 0.00015159326331540835, | |
| "loss": 1.1935, | |
| "mean_token_accuracy": 0.6860665520032246, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 1.9575151731524456, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 0.00015049730140830064, | |
| "loss": 1.1958, | |
| "mean_token_accuracy": 0.6895552823940913, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 1.9617993573723669, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 0.0001494036070153036, | |
| "loss": 1.1593, | |
| "mean_token_accuracy": 0.6954157501459122, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 1.9660835415922886, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 0.0001483122050600376, | |
| "loss": 1.1817, | |
| "mean_token_accuracy": 0.6880984852711359, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 1.9703677258122099, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 0.00014722312041388162, | |
| "loss": 1.1708, | |
| "mean_token_accuracy": 0.6934361755847931, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.9746519100321314, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 0.00014613637789540683, | |
| "loss": 1.1526, | |
| "mean_token_accuracy": 0.6956786572933197, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 1.9789360942520529, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 0.0001450520022698108, | |
| "loss": 1.1659, | |
| "mean_token_accuracy": 0.6891820311546326, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 1.9832202784719741, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 0.0001439700182483532, | |
| "loss": 1.1926, | |
| "mean_token_accuracy": 0.6883834769328435, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 1.9875044626918958, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 0.00014289045048779316, | |
| "loss": 1.179, | |
| "mean_token_accuracy": 0.6905747185150782, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 1.9917886469118171, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 0.00014181332358982615, | |
| "loss": 1.1518, | |
| "mean_token_accuracy": 0.6962420682112376, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.9960728311317386, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 0.00014073866210052478, | |
| "loss": 1.1798, | |
| "mean_token_accuracy": 0.6898052622874578, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.7421875, | |
| "learning_rate": 0.00013966649050977853, | |
| "loss": 1.1623, | |
| "mean_token_accuracy": 0.6901761282574047, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 2.0042841842199213, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 0.00013859683325073563, | |
| "loss": 0.848, | |
| "mean_token_accuracy": 0.7650491803884506, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 2.008568368439843, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 0.00013752971469924727, | |
| "loss": 0.863, | |
| "mean_token_accuracy": 0.7565648088852565, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 2.0128525526597643, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 0.00013646515917331055, | |
| "loss": 0.8361, | |
| "mean_token_accuracy": 0.7650597403446834, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 2.017136736879686, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 0.00013540319093251565, | |
| "loss": 0.8601, | |
| "mean_token_accuracy": 0.7594423244396845, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 2.0214209210996072, | |
| "grad_norm": 0.28125, | |
| "learning_rate": 0.00013434383417749247, | |
| "loss": 0.8135, | |
| "mean_token_accuracy": 0.7731389890114466, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 2.0257051053195285, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 0.0001332871130493587, | |
| "loss": 0.8556, | |
| "mean_token_accuracy": 0.7604803055524826, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 2.0299892895394502, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 0.0001322330516291709, | |
| "loss": 0.842, | |
| "mean_token_accuracy": 0.7640973548094432, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 2.0342734737593715, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 0.0001311816739373742, | |
| "loss": 0.8381, | |
| "mean_token_accuracy": 0.7674211412668228, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 2.0385576579792932, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 0.00013013300393325611, | |
| "loss": 0.8126, | |
| "mean_token_accuracy": 0.7714175979296366, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 2.0428418421992145, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 0.00012908706551440004, | |
| "loss": 0.8481, | |
| "mean_token_accuracy": 0.7640273501475652, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 2.047126026419136, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 0.00012804388251614037, | |
| "loss": 0.8311, | |
| "mean_token_accuracy": 0.7653455446163814, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 2.0514102106390575, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 0.00012700347871102036, | |
| "loss": 0.8258, | |
| "mean_token_accuracy": 0.7644449760516484, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 2.0556943948589788, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 0.00012596587780824923, | |
| "loss": 0.8364, | |
| "mean_token_accuracy": 0.7657805611689885, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 2.0599785790789005, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 0.0001249311034531623, | |
| "loss": 0.8471, | |
| "mean_token_accuracy": 0.7625262240568796, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 2.0642627632988217, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 0.00012389917922668245, | |
| "loss": 0.823, | |
| "mean_token_accuracy": 0.769603114326795, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 2.0685469475187435, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 0.0001228701286447824, | |
| "loss": 0.8242, | |
| "mean_token_accuracy": 0.7667522599299749, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 2.0728311317386647, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 0.00012184397515794888, | |
| "loss": 0.843, | |
| "mean_token_accuracy": 0.7642757395903269, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 2.077115315958586, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 0.00012082074215064836, | |
| "loss": 0.8298, | |
| "mean_token_accuracy": 0.7669574290513992, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 2.0813995001785077, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 0.00011980045294079384, | |
| "loss": 0.8241, | |
| "mean_token_accuracy": 0.7687270969152451, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 2.085683684398429, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 0.00011878313077921388, | |
| "loss": 0.8395, | |
| "mean_token_accuracy": 0.76190112332503, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 2.0899678686183507, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 0.00011776879884912247, | |
| "loss": 0.8234, | |
| "mean_token_accuracy": 0.7668329626321793, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 2.094252052838272, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 0.00011675748026559091, | |
| "loss": 0.8221, | |
| "mean_token_accuracy": 0.7703782876332601, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 2.0985362370581937, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 0.00011574919807502091, | |
| "loss": 0.8332, | |
| "mean_token_accuracy": 0.7663482298453649, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 2.102820421278115, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 0.00011474397525461919, | |
| "loss": 0.8166, | |
| "mean_token_accuracy": 0.7704643438259761, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 2.1071046054980362, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 0.0001137418347118744, | |
| "loss": 0.8254, | |
| "mean_token_accuracy": 0.7690837909777959, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 2.111388789717958, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 0.00011274279928403475, | |
| "loss": 0.8058, | |
| "mean_token_accuracy": 0.7719497382640839, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 2.1156729739378792, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 0.00011174689173758759, | |
| "loss": 0.8294, | |
| "mean_token_accuracy": 0.7664976229270299, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 2.119957158157801, | |
| "grad_norm": 0.388671875, | |
| "learning_rate": 0.00011075413476774066, | |
| "loss": 0.8292, | |
| "mean_token_accuracy": 0.7666595439116161, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 2.124241342377722, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 0.00010976455099790491, | |
| "loss": 0.8326, | |
| "mean_token_accuracy": 0.7672491510709126, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 2.1285255265976435, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 0.00010877816297917881, | |
| "loss": 0.8007, | |
| "mean_token_accuracy": 0.7737640182177226, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 2.132809710817565, | |
| "grad_norm": 0.3046875, | |
| "learning_rate": 0.00010779499318983463, | |
| "loss": 0.8092, | |
| "mean_token_accuracy": 0.7724389503399531, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 2.1370938950374865, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 0.00010681506403480617, | |
| "loss": 0.8111, | |
| "mean_token_accuracy": 0.7739120999972026, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 2.141378079257408, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 0.00010583839784517812, | |
| "loss": 0.82, | |
| "mean_token_accuracy": 0.7729650676250458, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 2.1456622634773295, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 0.00010486501687767719, | |
| "loss": 0.792, | |
| "mean_token_accuracy": 0.776722161968549, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 2.149946447697251, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 0.00010389494331416477, | |
| "loss": 0.8301, | |
| "mean_token_accuracy": 0.7654323528210322, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 2.1542306319171725, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 0.0001029281992611317, | |
| "loss": 0.7938, | |
| "mean_token_accuracy": 0.7787847359975179, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 2.1585148161370937, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 0.00010196480674919443, | |
| "loss": 0.8205, | |
| "mean_token_accuracy": 0.7693845987319946, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 2.1627990003570154, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 0.0001010047877325928, | |
| "loss": 0.7904, | |
| "mean_token_accuracy": 0.7785027374823889, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 2.1670831845769367, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 0.00010004816408869002, | |
| "loss": 0.8308, | |
| "mean_token_accuracy": 0.7703908115625382, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 2.1713673687968584, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 9.909495761747372e-05, | |
| "loss": 0.8253, | |
| "mean_token_accuracy": 0.7665383875370025, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 2.1756515530167797, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 9.814519004105946e-05, | |
| "loss": 0.8144, | |
| "mean_token_accuracy": 0.7718339473009109, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 2.179935737236701, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 9.719888300319602e-05, | |
| "loss": 0.8283, | |
| "mean_token_accuracy": 0.7683377832174301, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 2.1842199214566227, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 9.625605806877122e-05, | |
| "loss": 0.8472, | |
| "mean_token_accuracy": 0.7613804837067922, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 2.188504105676544, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 9.531673672332145e-05, | |
| "loss": 0.8098, | |
| "mean_token_accuracy": 0.7719518701235454, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 2.1927882898964657, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 9.438094037254172e-05, | |
| "loss": 0.7964, | |
| "mean_token_accuracy": 0.7749660849571228, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 2.197072474116387, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 9.344869034179735e-05, | |
| "loss": 0.8143, | |
| "mean_token_accuracy": 0.7723793685436249, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 2.2013566583363087, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 9.252000787563919e-05, | |
| "loss": 0.8087, | |
| "mean_token_accuracy": 0.7712360550959905, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 2.20564084255623, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 9.159491413731805e-05, | |
| "loss": 0.7954, | |
| "mean_token_accuracy": 0.7725468198458354, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 2.209925026776151, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 9.067343020830352e-05, | |
| "loss": 0.8136, | |
| "mean_token_accuracy": 0.7698225736618042, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 2.214209210996073, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 8.975557708780316e-05, | |
| "loss": 0.8181, | |
| "mean_token_accuracy": 0.769992024699847, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 2.218493395215994, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 8.884137569228362e-05, | |
| "loss": 0.8098, | |
| "mean_token_accuracy": 0.770973147948583, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 2.222777579435916, | |
| "grad_norm": 0.3359375, | |
| "learning_rate": 8.793084685499498e-05, | |
| "loss": 0.7998, | |
| "mean_token_accuracy": 0.776837948958079, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 2.227061763655837, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 8.702401132549459e-05, | |
| "loss": 0.7891, | |
| "mean_token_accuracy": 0.7805549720923106, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 2.2313459478757585, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 8.612088976917554e-05, | |
| "loss": 0.8364, | |
| "mean_token_accuracy": 0.7643806467453639, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 2.23563013209568, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 8.522150276679494e-05, | |
| "loss": 0.8184, | |
| "mean_token_accuracy": 0.7683983782927195, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 2.2399143163156015, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 8.432587081400492e-05, | |
| "loss": 0.7959, | |
| "mean_token_accuracy": 0.778293655316035, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 2.244198500535523, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 8.343401432088629e-05, | |
| "loss": 0.7847, | |
| "mean_token_accuracy": 0.7779005120197932, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 2.2484826847554444, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 8.254595361148262e-05, | |
| "loss": 0.7926, | |
| "mean_token_accuracy": 0.7749503721793493, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 2.2527668689753657, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 8.166170892333722e-05, | |
| "loss": 0.7907, | |
| "mean_token_accuracy": 0.7779211064179739, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 2.2570510531952874, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 8.078130040703238e-05, | |
| "loss": 0.7985, | |
| "mean_token_accuracy": 0.7758079042037328, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 2.2613352374152087, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 7.990474812572981e-05, | |
| "loss": 0.7876, | |
| "mean_token_accuracy": 0.7797471781571707, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 2.2656194216351304, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 7.903207205471358e-05, | |
| "loss": 0.8053, | |
| "mean_token_accuracy": 0.7768173178037008, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 2.2699036058550517, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 7.81632920809349e-05, | |
| "loss": 0.7792, | |
| "mean_token_accuracy": 0.7814870576063792, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 2.2741877900749734, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 7.729842800255865e-05, | |
| "loss": 0.8226, | |
| "mean_token_accuracy": 0.768639792005221, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 2.2784719742948947, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 7.64374995285127e-05, | |
| "loss": 0.7939, | |
| "mean_token_accuracy": 0.7751558671394984, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 2.282756158514816, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 7.558052627803846e-05, | |
| "loss": 0.7966, | |
| "mean_token_accuracy": 0.7758613834778468, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 2.2870403427347377, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 7.472752778024383e-05, | |
| "loss": 0.7812, | |
| "mean_token_accuracy": 0.7802222698926926, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 2.291324526954659, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 7.387852347365829e-05, | |
| "loss": 0.7849, | |
| "mean_token_accuracy": 0.7787080804506937, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 2.2956087111745807, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 7.303353270578952e-05, | |
| "loss": 0.8034, | |
| "mean_token_accuracy": 0.770248160759608, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 2.299892895394502, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 7.219257473268312e-05, | |
| "loss": 0.8094, | |
| "mean_token_accuracy": 0.7722504367431005, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 2.3041770796144236, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 7.135566871848331e-05, | |
| "loss": 0.8008, | |
| "mean_token_accuracy": 0.7736594120661417, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 2.308461263834345, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 7.052283373499649e-05, | |
| "loss": 0.797, | |
| "mean_token_accuracy": 0.7745833595593771, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 2.312745448054266, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 6.969408876125637e-05, | |
| "loss": 0.7898, | |
| "mean_token_accuracy": 0.77644149462382, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 2.317029632274188, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 6.886945268309177e-05, | |
| "loss": 0.8053, | |
| "mean_token_accuracy": 0.7735361973444621, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 2.321313816494109, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 6.804894429269582e-05, | |
| "loss": 0.7936, | |
| "mean_token_accuracy": 0.7798129876454671, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 2.325598000714031, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 6.723258228819815e-05, | |
| "loss": 0.796, | |
| "mean_token_accuracy": 0.7762234061956406, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 2.329882184933952, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 6.64203852732386e-05, | |
| "loss": 0.8006, | |
| "mean_token_accuracy": 0.7752728084723155, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 2.3341663691538734, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 6.561237175654325e-05, | |
| "loss": 0.7862, | |
| "mean_token_accuracy": 0.7770834614833196, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 2.338450553373795, | |
| "grad_norm": 0.337890625, | |
| "learning_rate": 6.480856015150272e-05, | |
| "loss": 0.8179, | |
| "mean_token_accuracy": 0.7689370075861613, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 2.3427347375937164, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 6.40089687757523e-05, | |
| "loss": 0.7858, | |
| "mean_token_accuracy": 0.7789360960324605, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 2.347018921813638, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 6.32136158507549e-05, | |
| "loss": 0.8105, | |
| "mean_token_accuracy": 0.7707864145437876, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 2.3513031060335594, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 6.242251950138564e-05, | |
| "loss": 0.7907, | |
| "mean_token_accuracy": 0.7774596979220708, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 2.3555872902534807, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 6.163569775551869e-05, | |
| "loss": 0.7875, | |
| "mean_token_accuracy": 0.780629759033521, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 2.3598714744734024, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 6.0853168543616694e-05, | |
| "loss": 0.8057, | |
| "mean_token_accuracy": 0.7729009737571081, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 2.3641556586933237, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 6.007494969832181e-05, | |
| "loss": 0.7709, | |
| "mean_token_accuracy": 0.7820465574661891, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 2.3684398429132454, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 5.9301058954049664e-05, | |
| "loss": 0.7857, | |
| "mean_token_accuracy": 0.7776114324728648, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 2.3727240271331667, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 5.853151394658526e-05, | |
| "loss": 0.7841, | |
| "mean_token_accuracy": 0.777163389325142, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 2.3770082113530884, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 5.776633221268057e-05, | |
| "loss": 0.7933, | |
| "mean_token_accuracy": 0.7760427872339885, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 2.3812923955730096, | |
| "grad_norm": 0.2890625, | |
| "learning_rate": 5.7005531189655515e-05, | |
| "loss": 0.8003, | |
| "mean_token_accuracy": 0.7774786601463953, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 2.385576579792931, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 5.624912821500025e-05, | |
| "loss": 0.789, | |
| "mean_token_accuracy": 0.77956602871418, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 2.3898607640128526, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 5.5497140525979925e-05, | |
| "loss": 0.8054, | |
| "mean_token_accuracy": 0.7731809784968694, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 2.394144948232774, | |
| "grad_norm": 0.3359375, | |
| "learning_rate": 5.474958525924262e-05, | |
| "loss": 0.7969, | |
| "mean_token_accuracy": 0.7769083728392919, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 2.3984291324526956, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 5.4006479450427694e-05, | |
| "loss": 0.7873, | |
| "mean_token_accuracy": 0.7816483676433563, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 2.402713316672617, | |
| "grad_norm": 0.3359375, | |
| "learning_rate": 5.3267840033778516e-05, | |
| "loss": 0.7924, | |
| "mean_token_accuracy": 0.7759514023860296, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 2.4069975008925386, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 5.25336838417563e-05, | |
| "loss": 0.7693, | |
| "mean_token_accuracy": 0.7819558610518773, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 2.41128168511246, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 5.1804027604655995e-05, | |
| "loss": 0.7901, | |
| "mean_token_accuracy": 0.7765521576007207, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 2.415565869332381, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 5.1078887950226084e-05, | |
| "loss": 0.8054, | |
| "mean_token_accuracy": 0.7711812168359756, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 2.419850053552303, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 5.035828140328852e-05, | |
| "loss": 0.7997, | |
| "mean_token_accuracy": 0.7752634723981221, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 2.424134237772224, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 4.964222438536295e-05, | |
| "loss": 0.7682, | |
| "mean_token_accuracy": 0.7835259586572647, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 2.428418421992146, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 4.8930733214292227e-05, | |
| "loss": 0.7798, | |
| "mean_token_accuracy": 0.7836492071549098, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 2.432702606212067, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 4.822382410387027e-05, | |
| "loss": 0.7744, | |
| "mean_token_accuracy": 0.7835030923287074, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 2.4369867904319884, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 4.752151316347328e-05, | |
| "loss": 0.7777, | |
| "mean_token_accuracy": 0.7836812446514766, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 2.44127097465191, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 4.682381639769195e-05, | |
| "loss": 0.7843, | |
| "mean_token_accuracy": 0.7796516954898834, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 2.4455551588718314, | |
| "grad_norm": 0.337890625, | |
| "learning_rate": 4.6130749705966924e-05, | |
| "loss": 0.7856, | |
| "mean_token_accuracy": 0.7769708921511967, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 2.449839343091753, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 4.544232888222674e-05, | |
| "loss": 0.7558, | |
| "mean_token_accuracy": 0.7886910130580266, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 2.4541235273116744, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 4.475856961452765e-05, | |
| "loss": 0.7777, | |
| "mean_token_accuracy": 0.7824478884538014, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 2.4584077115315957, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 4.407948748469615e-05, | |
| "loss": 0.7647, | |
| "mean_token_accuracy": 0.7870442191759746, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 2.4626918957515174, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 4.340509796797401e-05, | |
| "loss": 0.8013, | |
| "mean_token_accuracy": 0.7732500980297724, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 2.4669760799714386, | |
| "grad_norm": 0.2890625, | |
| "learning_rate": 4.273541643266537e-05, | |
| "loss": 0.7809, | |
| "mean_token_accuracy": 0.7783005088567734, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 2.4712602641913604, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 4.2070458139786886e-05, | |
| "loss": 0.7664, | |
| "mean_token_accuracy": 0.7849383483330409, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 2.4755444484112816, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 4.141023824271964e-05, | |
| "loss": 0.7862, | |
| "mean_token_accuracy": 0.7778257886568706, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 2.479828632631203, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 4.075477178686382e-05, | |
| "loss": 0.7558, | |
| "mean_token_accuracy": 0.7865474035342535, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 2.4841128168511246, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 4.010407370929634e-05, | |
| "loss": 0.7903, | |
| "mean_token_accuracy": 0.776697979370753, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 2.488397001071046, | |
| "grad_norm": 0.3046875, | |
| "learning_rate": 3.94581588384296e-05, | |
| "loss": 0.7905, | |
| "mean_token_accuracy": 0.7786333004633585, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 2.4926811852909676, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 3.8817041893674407e-05, | |
| "loss": 0.7815, | |
| "mean_token_accuracy": 0.7784610877434412, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 2.496965369510889, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 3.818073748510406e-05, | |
| "loss": 0.7908, | |
| "mean_token_accuracy": 0.7784650901953379, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 2.50124955373081, | |
| "grad_norm": 0.3046875, | |
| "learning_rate": 3.754926011312137e-05, | |
| "loss": 0.7778, | |
| "mean_token_accuracy": 0.7789822975794475, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 2.505533737950732, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 3.692262416812869e-05, | |
| "loss": 0.7799, | |
| "mean_token_accuracy": 0.7808815310398738, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 2.5098179221706536, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 3.63008439301995e-05, | |
| "loss": 0.7668, | |
| "mean_token_accuracy": 0.7802990694840749, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 2.514102106390575, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 3.568393356875305e-05, | |
| "loss": 0.7787, | |
| "mean_token_accuracy": 0.7813133666912715, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 2.518386290610496, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 3.507190714223168e-05, | |
| "loss": 0.7769, | |
| "mean_token_accuracy": 0.7826229612032573, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 2.522670474830418, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 3.446477859778041e-05, | |
| "loss": 0.78, | |
| "mean_token_accuracy": 0.7789117127656937, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 2.526954659050339, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 3.386256177092886e-05, | |
| "loss": 0.7882, | |
| "mean_token_accuracy": 0.7778934846321742, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 2.531238843270261, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 3.3265270385276296e-05, | |
| "loss": 0.7719, | |
| "mean_token_accuracy": 0.7834851761658986, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 2.535523027490182, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 3.267291805217851e-05, | |
| "loss": 0.7863, | |
| "mean_token_accuracy": 0.7806941578785579, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 2.5398072117101034, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 3.208551827043804e-05, | |
| "loss": 0.7656, | |
| "mean_token_accuracy": 0.7854113181432089, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 2.544091395930025, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 3.150308442599631e-05, | |
| "loss": 0.7658, | |
| "mean_token_accuracy": 0.783687628308932, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 2.5483755801499464, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 3.092562979162864e-05, | |
| "loss": 0.8225, | |
| "mean_token_accuracy": 0.7703000376621882, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 2.552659764369868, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 3.0353167526641745e-05, | |
| "loss": 0.7792, | |
| "mean_token_accuracy": 0.7798318127791087, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 2.5569439485897894, | |
| "grad_norm": 0.3046875, | |
| "learning_rate": 2.97857106765739e-05, | |
| "loss": 0.7706, | |
| "mean_token_accuracy": 0.7838211774826049, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 2.5612281328097106, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 2.9223272172897607e-05, | |
| "loss": 0.7815, | |
| "mean_token_accuracy": 0.7787001341581344, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 2.5655123170296323, | |
| "grad_norm": 0.33984375, | |
| "learning_rate": 2.866586483272507e-05, | |
| "loss": 0.7745, | |
| "mean_token_accuracy": 0.780657422542572, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 2.5697965012495536, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 2.8113501358515813e-05, | |
| "loss": 0.7755, | |
| "mean_token_accuracy": 0.7821098357439041, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.5740806854694753, | |
| "grad_norm": 0.345703125, | |
| "learning_rate": 2.7566194337787507e-05, | |
| "loss": 0.8095, | |
| "mean_token_accuracy": 0.7710189620653788, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 2.5783648696893966, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 2.7023956242828968e-05, | |
| "loss": 0.7781, | |
| "mean_token_accuracy": 0.7805722614129385, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 2.582649053909318, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 2.6486799430415875e-05, | |
| "loss": 0.7774, | |
| "mean_token_accuracy": 0.7799272914727529, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 2.5869332381292396, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 2.595473614152932e-05, | |
| "loss": 0.7781, | |
| "mean_token_accuracy": 0.7814104864994685, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 2.591217422349161, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 2.5427778501076804e-05, | |
| "loss": 0.7885, | |
| "mean_token_accuracy": 0.7791467080513637, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 2.5955016065690826, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 2.490593851761591e-05, | |
| "loss": 0.7689, | |
| "mean_token_accuracy": 0.7836964478095373, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 2.599785790789004, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 2.4389228083080722e-05, | |
| "loss": 0.8215, | |
| "mean_token_accuracy": 0.7677028367916743, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 2.604069975008925, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 2.387765897251057e-05, | |
| "loss": 0.7749, | |
| "mean_token_accuracy": 0.7838013221820196, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 2.608354159228847, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 2.3371242843782088e-05, | |
| "loss": 0.7725, | |
| "mean_token_accuracy": 0.7843282967805862, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 2.6126383434487686, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 2.2869991237343207e-05, | |
| "loss": 0.797, | |
| "mean_token_accuracy": 0.7766178419192632, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 2.61692252766869, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 2.237391557595042e-05, | |
| "loss": 0.7642, | |
| "mean_token_accuracy": 0.7866159160931905, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 2.621206711888611, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 2.188302716440832e-05, | |
| "loss": 0.7788, | |
| "mean_token_accuracy": 0.7817911605040232, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 2.625490896108533, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 2.1397337189311915e-05, | |
| "loss": 0.7902, | |
| "mean_token_accuracy": 0.7791908890008926, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 2.629775080328454, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 2.091685671879187e-05, | |
| "loss": 0.7905, | |
| "mean_token_accuracy": 0.7808248698711395, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 2.634059264548376, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 2.044159670226245e-05, | |
| "loss": 0.7572, | |
| "mean_token_accuracy": 0.7873713413874308, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 2.638343448768297, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 1.9971567970171355e-05, | |
| "loss": 0.7987, | |
| "mean_token_accuracy": 0.7766677429278691, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 2.6426276329882183, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 1.95067812337536e-05, | |
| "loss": 0.7688, | |
| "mean_token_accuracy": 0.7839142779509226, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 2.64691181720814, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 1.9047247084787112e-05, | |
| "loss": 0.7602, | |
| "mean_token_accuracy": 0.7852412790060044, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 2.6511960014280613, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 1.8592975995351257e-05, | |
| "loss": 0.762, | |
| "mean_token_accuracy": 0.7860971083243687, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 2.655480185647983, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 1.81439783175886e-05, | |
| "loss": 0.7658, | |
| "mean_token_accuracy": 0.7843163589636485, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 2.6597643698679043, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 1.7700264283468465e-05, | |
| "loss": 0.7793, | |
| "mean_token_accuracy": 0.7768728653589885, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 2.6640485540878256, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 1.7261844004554105e-05, | |
| "loss": 0.7862, | |
| "mean_token_accuracy": 0.7782338261604309, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 2.6683327383077473, | |
| "grad_norm": 0.341796875, | |
| "learning_rate": 1.6828727471772358e-05, | |
| "loss": 0.7873, | |
| "mean_token_accuracy": 0.7764876892169317, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 2.6726169225276686, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 1.6400924555185492e-05, | |
| "loss": 0.7774, | |
| "mean_token_accuracy": 0.7807328204313914, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 2.6769011067475903, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 1.5978445003766968e-05, | |
| "loss": 0.786, | |
| "mean_token_accuracy": 0.7790028562148412, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 2.6811852909675116, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 1.5561298445178617e-05, | |
| "loss": 0.7511, | |
| "mean_token_accuracy": 0.7876404513915379, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 2.685469475187433, | |
| "grad_norm": 0.337890625, | |
| "learning_rate": 1.5149494385551688e-05, | |
| "loss": 0.781, | |
| "mean_token_accuracy": 0.7786963661511739, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 2.6897536594073546, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 1.474304220927003e-05, | |
| "loss": 0.7837, | |
| "mean_token_accuracy": 0.780462098121643, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 2.694037843627276, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 1.4341951178756168e-05, | |
| "loss": 0.7817, | |
| "mean_token_accuracy": 0.7801312992970149, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 2.6983220278471975, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 1.3946230434260493e-05, | |
| "loss": 0.7837, | |
| "mean_token_accuracy": 0.7814756115277608, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 2.702606212067119, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 1.3555888993652732e-05, | |
| "loss": 0.7825, | |
| "mean_token_accuracy": 0.7780475705862046, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 2.70689039628704, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 1.3170935752216423e-05, | |
| "loss": 0.7856, | |
| "mean_token_accuracy": 0.7791298975547155, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 2.711174580506962, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 1.2791379482446407e-05, | |
| "loss": 0.7796, | |
| "mean_token_accuracy": 0.7799790789683659, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 2.7154587647268835, | |
| "grad_norm": 0.337890625, | |
| "learning_rate": 1.2417228833848798e-05, | |
| "loss": 0.7748, | |
| "mean_token_accuracy": 0.7825185318787893, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 2.719742948946805, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 1.2048492332743827e-05, | |
| "loss": 0.7963, | |
| "mean_token_accuracy": 0.7784641563892365, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 2.724027133166726, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 1.1685178382071698e-05, | |
| "loss": 0.7778, | |
| "mean_token_accuracy": 0.7813698927561442, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 2.728311317386648, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 1.1327295261200826e-05, | |
| "loss": 0.7605, | |
| "mean_token_accuracy": 0.7861085881789526, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 2.732595501606569, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 1.0974851125739483e-05, | |
| "loss": 0.7647, | |
| "mean_token_accuracy": 0.7843359092871348, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 2.7368796858264908, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 1.0627854007349725e-05, | |
| "loss": 0.7771, | |
| "mean_token_accuracy": 0.7815760542949041, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 2.741163870046412, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 1.0286311813564487e-05, | |
| "loss": 0.7819, | |
| "mean_token_accuracy": 0.7804714103539785, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.7454480542663333, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 9.950232327607278e-06, | |
| "loss": 0.7887, | |
| "mean_token_accuracy": 0.7779040902853012, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 2.749732238486255, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 9.619623208214801e-06, | |
| "loss": 0.7795, | |
| "mean_token_accuracy": 0.7815526028474172, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 2.7540164227061763, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 9.29449198946264e-06, | |
| "loss": 0.7577, | |
| "mean_token_accuracy": 0.7885059833526611, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 2.758300606926098, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 8.974846080593262e-06, | |
| "loss": 0.7688, | |
| "mean_token_accuracy": 0.7825045188268026, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 2.7625847911460193, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 8.66069276584741e-06, | |
| "loss": 0.7639, | |
| "mean_token_accuracy": 0.7843573103348415, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 2.7668689753659406, | |
| "grad_norm": 0.279296875, | |
| "learning_rate": 8.352039204298029e-06, | |
| "loss": 0.7457, | |
| "mean_token_accuracy": 0.7897063046693802, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 2.7711531595858623, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 8.048892429687066e-06, | |
| "loss": 0.7661, | |
| "mean_token_accuracy": 0.7825449128945668, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 2.7754373438057836, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 7.751259350265216e-06, | |
| "loss": 0.7502, | |
| "mean_token_accuracy": 0.7883096744616827, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 2.7797215280257053, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 7.459146748634516e-06, | |
| "loss": 0.7681, | |
| "mean_token_accuracy": 0.7826910416285197, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 2.7840057122456265, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 7.172561281593798e-06, | |
| "loss": 0.7842, | |
| "mean_token_accuracy": 0.7800470501184463, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.788289896465548, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 6.891509479986957e-06, | |
| "loss": 0.7857, | |
| "mean_token_accuracy": 0.7821651329596837, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 2.7925740806854695, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 6.615997748554148e-06, | |
| "loss": 0.7631, | |
| "mean_token_accuracy": 0.7860769232114156, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 2.796858264905391, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 6.346032365785709e-06, | |
| "loss": 0.8009, | |
| "mean_token_accuracy": 0.7765180408954621, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 2.8011424491253125, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 6.081619483779277e-06, | |
| "loss": 0.7809, | |
| "mean_token_accuracy": 0.7795830368995667, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 2.805426633345234, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 5.822765128099483e-06, | |
| "loss": 0.764, | |
| "mean_token_accuracy": 0.7845493823289871, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 2.809710817565155, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 5.569475197640672e-06, | |
| "loss": 0.7747, | |
| "mean_token_accuracy": 0.780571989218394, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 2.8139950017850768, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 5.321755464492456e-06, | |
| "loss": 0.7897, | |
| "mean_token_accuracy": 0.7762352069218953, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 2.818279186004998, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 5.079611573808124e-06, | |
| "loss": 0.7919, | |
| "mean_token_accuracy": 0.7770307193199794, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 2.8225633702249198, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 4.843049043676079e-06, | |
| "loss": 0.7841, | |
| "mean_token_accuracy": 0.7784975161155064, | |
| "step": 6590 | |
| }, | |
| { | |
| "epoch": 2.826847554444841, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 4.61207326499416e-06, | |
| "loss": 0.8001, | |
| "mean_token_accuracy": 0.7755351980527242, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 2.8311317386647623, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 4.386689501346574e-06, | |
| "loss": 0.8, | |
| "mean_token_accuracy": 0.7723700026671092, | |
| "step": 6610 | |
| }, | |
| { | |
| "epoch": 2.835415922884684, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 4.166902888884105e-06, | |
| "loss": 0.7928, | |
| "mean_token_accuracy": 0.7796116421620051, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 2.8397001071046057, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 3.952718436207065e-06, | |
| "loss": 0.7813, | |
| "mean_token_accuracy": 0.7800139904022216, | |
| "step": 6630 | |
| }, | |
| { | |
| "epoch": 2.843984291324527, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 3.7441410242510796e-06, | |
| "loss": 0.7706, | |
| "mean_token_accuracy": 0.7841657598813375, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 2.8482684755444483, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 3.5411754061759614e-06, | |
| "loss": 0.7877, | |
| "mean_token_accuracy": 0.7804912636677425, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 2.85255265976437, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 3.3438262072572612e-06, | |
| "loss": 0.7821, | |
| "mean_token_accuracy": 0.7790777862071991, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 2.8568368439842913, | |
| "grad_norm": 0.4140625, | |
| "learning_rate": 3.1520979247810223e-06, | |
| "loss": 0.7877, | |
| "mean_token_accuracy": 0.7802624036868413, | |
| "step": 6670 | |
| }, | |
| { | |
| "epoch": 2.861121028204213, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 2.9659949279411404e-06, | |
| "loss": 0.7736, | |
| "mean_token_accuracy": 0.7815846174955368, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 2.8654052124241343, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 2.785521457739859e-06, | |
| "loss": 0.7598, | |
| "mean_token_accuracy": 0.7873433222373326, | |
| "step": 6690 | |
| }, | |
| { | |
| "epoch": 2.8696893966440555, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 2.610681626891237e-06, | |
| "loss": 0.773, | |
| "mean_token_accuracy": 0.7824377208948136, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 2.8739735808639773, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 2.4414794197272217e-06, | |
| "loss": 0.8005, | |
| "mean_token_accuracy": 0.7779265910387039, | |
| "step": 6710 | |
| }, | |
| { | |
| "epoch": 2.8782577650838985, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 2.277918692106973e-06, | |
| "loss": 0.7652, | |
| "mean_token_accuracy": 0.7827598293622334, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 2.8825419493038202, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 2.120003171328988e-06, | |
| "loss": 0.8107, | |
| "mean_token_accuracy": 0.7738327503204345, | |
| "step": 6730 | |
| }, | |
| { | |
| "epoch": 2.8868261335237415, | |
| "grad_norm": 0.3046875, | |
| "learning_rate": 1.9677364560460874e-06, | |
| "loss": 0.7964, | |
| "mean_token_accuracy": 0.7783457924922307, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 2.891110317743663, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 1.8211220161835629e-06, | |
| "loss": 0.7952, | |
| "mean_token_accuracy": 0.778786172469457, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 2.8953945019635845, | |
| "grad_norm": 0.33984375, | |
| "learning_rate": 1.6801631928599626e-06, | |
| "loss": 0.7956, | |
| "mean_token_accuracy": 0.7760281264781952, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 2.8996786861835058, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 1.5448631983109584e-06, | |
| "loss": 0.7765, | |
| "mean_token_accuracy": 0.7821375171343485, | |
| "step": 6770 | |
| }, | |
| { | |
| "epoch": 2.9039628704034275, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 1.41522511581621e-06, | |
| "loss": 0.7765, | |
| "mean_token_accuracy": 0.7809682806332906, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 2.9082470546233488, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 1.2912518996290866e-06, | |
| "loss": 0.7705, | |
| "mean_token_accuracy": 0.786265421907107, | |
| "step": 6790 | |
| }, | |
| { | |
| "epoch": 2.91253123884327, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 1.1729463749093338e-06, | |
| "loss": 0.7982, | |
| "mean_token_accuracy": 0.7765524844328563, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 2.9168154230631917, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 1.060311237658651e-06, | |
| "loss": 0.773, | |
| "mean_token_accuracy": 0.7809695929288865, | |
| "step": 6810 | |
| }, | |
| { | |
| "epoch": 2.921099607283113, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 9.533490546593248e-07, | |
| "loss": 0.774, | |
| "mean_token_accuracy": 0.7783287167549133, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 2.9253837915030347, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 8.520622634156927e-07, | |
| "loss": 0.8114, | |
| "mean_token_accuracy": 0.7732517321904501, | |
| "step": 6830 | |
| }, | |
| { | |
| "epoch": 2.929667975722956, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 7.564531720985756e-07, | |
| "loss": 0.7683, | |
| "mean_token_accuracy": 0.7860218505064647, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 2.9339521599428773, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 6.665239594927929e-07, | |
| "loss": 0.7902, | |
| "mean_token_accuracy": 0.780942557255427, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 2.938236344162799, | |
| "grad_norm": 0.3203125, | |
| "learning_rate": 5.82276674947313e-07, | |
| "loss": 0.7628, | |
| "mean_token_accuracy": 0.7842933177947998, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 2.9425205283827207, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 5.037132383287624e-07, | |
| "loss": 0.7806, | |
| "mean_token_accuracy": 0.7809433400630951, | |
| "step": 6870 | |
| }, | |
| { | |
| "epoch": 2.946804712602642, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 4.308354399775172e-07, | |
| "loss": 0.7796, | |
| "mean_token_accuracy": 0.7829487164815266, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 2.9510888968225633, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 3.636449406670128e-07, | |
| "loss": 0.7785, | |
| "mean_token_accuracy": 0.7833139419555664, | |
| "step": 6890 | |
| }, | |
| { | |
| "epoch": 2.955373081042485, | |
| "grad_norm": 0.3046875, | |
| "learning_rate": 3.021432715658023e-07, | |
| "loss": 0.776, | |
| "mean_token_accuracy": 0.7816599200169245, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.9596572652624062, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 2.4633183420275093e-07, | |
| "loss": 0.7895, | |
| "mean_token_accuracy": 0.7800428807735443, | |
| "step": 6910 | |
| }, | |
| { | |
| "epoch": 2.963941449482328, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 1.9621190043506155e-07, | |
| "loss": 0.7643, | |
| "mean_token_accuracy": 0.7820758432149887, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 2.9682256337022492, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 1.5178461241918684e-07, | |
| "loss": 0.7962, | |
| "mean_token_accuracy": 0.7780212819576263, | |
| "step": 6930 | |
| }, | |
| { | |
| "epoch": 2.9725098179221705, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 1.1305098258504454e-07, | |
| "loss": 0.7694, | |
| "mean_token_accuracy": 0.7845108538866044, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 2.976794002142092, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 8.001189361273032e-08, | |
| "loss": 0.7709, | |
| "mean_token_accuracy": 0.7825704693794251, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 2.9810781863620135, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 5.266809841247833e-08, | |
| "loss": 0.7737, | |
| "mean_token_accuracy": 0.7819208929936091, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 2.985362370581935, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 3.1020220107480513e-08, | |
| "loss": 0.7593, | |
| "mean_token_accuracy": 0.7868289381265641, | |
| "step": 6970 | |
| }, | |
| { | |
| "epoch": 2.9896465548018565, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 1.506875201975899e-08, | |
| "loss": 0.8075, | |
| "mean_token_accuracy": 0.7752761413653692, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 2.9939307390217778, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 4.8140576588140415e-09, | |
| "loss": 0.7801, | |
| "mean_token_accuracy": 0.7793139110008875, | |
| "step": 6990 | |
| }, | |
| { | |
| "epoch": 2.9982149232416995, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 2.5637071346396034e-10, | |
| "loss": 0.7702, | |
| "mean_token_accuracy": 0.7825540274381637, | |
| "step": 7000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 7002, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.3733801424314171e+19, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
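
The dump above is a standard Hugging Face `Trainer` state file. A minimal sketch for inspecting it, assuming the JSON has been saved as `trainer_state.json` (the path and the plotting choices are illustrative assumptions, not part of the log itself):

```python
import json

import matplotlib.pyplot as plt

# Load the trainer state dumped above (path is an assumption; point it at
# the checkpoint directory that actually holds trainer_state.json).
with open("trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]  # one dict per logged step

steps = [e["step"] for e in history]
loss = [e["loss"] for e in history]
lr = [e["learning_rate"] for e in history]
acc = [e["mean_token_accuracy"] for e in history]

fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))

# Top panel: training loss and mean token accuracy over optimizer steps.
ax1.plot(steps, loss, label="loss")
ax1.plot(steps, acc, label="mean_token_accuracy")
ax1.set_ylabel("loss / accuracy")
ax1.legend()

# Bottom panel: the learning-rate schedule recorded in the log.
ax2.plot(steps, lr, color="tab:green")
ax2.set_ylabel("learning_rate")
ax2.set_xlabel("step")

fig.suptitle(f"global_step={state['global_step']}, epoch={state['epoch']:.2f}")
fig.tight_layout()
plt.show()
```

Entries are recorded every 10 optimizer steps (`logging_steps`), and the run ended at `max_steps` = 7002, as the `should_training_stop` flag under `stateful_callbacks` confirms; the decay of `learning_rate` to ~2.6e-10 and the final loss around 0.77 should be directly visible in the plot.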