| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 4.994690265486725, |
| "eval_steps": 500, |
| "global_step": 4235, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.011799410029498525, |
| "grad_norm": 5.65625, |
| "learning_rate": 1.061320754716981e-08, |
| "loss": 0.6804, |
| "mean_token_accuracy": 0.8077195569872856, |
| "num_tokens": 20623.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.02359882005899705, |
| "grad_norm": 5.625, |
| "learning_rate": 2.2405660377358488e-08, |
| "loss": 0.723, |
| "mean_token_accuracy": 0.7997027933597565, |
| "num_tokens": 41642.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.035398230088495575, |
| "grad_norm": 6.15625, |
| "learning_rate": 3.4198113207547165e-08, |
| "loss": 0.6885, |
| "mean_token_accuracy": 0.8035591304302215, |
| "num_tokens": 62462.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.0471976401179941, |
| "grad_norm": 5.9375, |
| "learning_rate": 4.5990566037735846e-08, |
| "loss": 0.6987, |
| "mean_token_accuracy": 0.8067316144704819, |
| "num_tokens": 84115.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.058997050147492625, |
| "grad_norm": 5.28125, |
| "learning_rate": 5.7783018867924526e-08, |
| "loss": 0.6904, |
| "mean_token_accuracy": 0.8041402399539948, |
| "num_tokens": 105332.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.07079646017699115, |
| "grad_norm": 5.65625, |
| "learning_rate": 6.95754716981132e-08, |
| "loss": 0.6955, |
| "mean_token_accuracy": 0.807902130484581, |
| "num_tokens": 126409.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.08259587020648967, |
| "grad_norm": 4.25, |
| "learning_rate": 8.136792452830188e-08, |
| "loss": 0.6805, |
| "mean_token_accuracy": 0.8092467188835144, |
| "num_tokens": 148220.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.0943952802359882, |
| "grad_norm": 6.9375, |
| "learning_rate": 9.316037735849056e-08, |
| "loss": 0.6778, |
| "mean_token_accuracy": 0.8039914906024933, |
| "num_tokens": 169303.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.10619469026548672, |
| "grad_norm": 5.625, |
| "learning_rate": 1.0495283018867924e-07, |
| "loss": 0.6609, |
| "mean_token_accuracy": 0.8101286813616753, |
| "num_tokens": 190257.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.11799410029498525, |
| "grad_norm": 6.0625, |
| "learning_rate": 1.1674528301886792e-07, |
| "loss": 0.7411, |
| "mean_token_accuracy": 0.7984047293663025, |
| "num_tokens": 211085.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.12979351032448377, |
| "grad_norm": 4.90625, |
| "learning_rate": 1.2853773584905662e-07, |
| "loss": 0.7285, |
| "mean_token_accuracy": 0.7961118310689926, |
| "num_tokens": 232308.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.1415929203539823, |
| "grad_norm": 5.28125, |
| "learning_rate": 1.4033018867924528e-07, |
| "loss": 0.7332, |
| "mean_token_accuracy": 0.7981508865952491, |
| "num_tokens": 253244.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.15339233038348082, |
| "grad_norm": 5.4375, |
| "learning_rate": 1.5212264150943395e-07, |
| "loss": 0.6549, |
| "mean_token_accuracy": 0.8104467928409577, |
| "num_tokens": 274351.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.16519174041297935, |
| "grad_norm": 5.71875, |
| "learning_rate": 1.6391509433962264e-07, |
| "loss": 0.653, |
| "mean_token_accuracy": 0.8155501663684845, |
| "num_tokens": 294930.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.17699115044247787, |
| "grad_norm": 5.125, |
| "learning_rate": 1.757075471698113e-07, |
| "loss": 0.699, |
| "mean_token_accuracy": 0.8086191982030868, |
| "num_tokens": 314808.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.1887905604719764, |
| "grad_norm": 5.28125, |
| "learning_rate": 1.875e-07, |
| "loss": 0.6899, |
| "mean_token_accuracy": 0.8029965132474899, |
| "num_tokens": 335816.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.20058997050147492, |
| "grad_norm": 4.75, |
| "learning_rate": 1.9929245283018867e-07, |
| "loss": 0.7358, |
| "mean_token_accuracy": 0.8015004798769951, |
| "num_tokens": 357321.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.21238938053097345, |
| "grad_norm": 5.46875, |
| "learning_rate": 2.1108490566037734e-07, |
| "loss": 0.6692, |
| "mean_token_accuracy": 0.8106805741786957, |
| "num_tokens": 378661.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.22418879056047197, |
| "grad_norm": 5.1875, |
| "learning_rate": 2.2287735849056603e-07, |
| "loss": 0.7268, |
| "mean_token_accuracy": 0.8009970590472222, |
| "num_tokens": 400365.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.2359882005899705, |
| "grad_norm": 5.75, |
| "learning_rate": 2.346698113207547e-07, |
| "loss": 0.6448, |
| "mean_token_accuracy": 0.810512238740921, |
| "num_tokens": 422850.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.24778761061946902, |
| "grad_norm": 6.0625, |
| "learning_rate": 2.464622641509434e-07, |
| "loss": 0.6857, |
| "mean_token_accuracy": 0.8057759568095207, |
| "num_tokens": 443782.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.25958702064896755, |
| "grad_norm": 5.5, |
| "learning_rate": 2.5825471698113206e-07, |
| "loss": 0.6802, |
| "mean_token_accuracy": 0.8078009814023972, |
| "num_tokens": 464551.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.2713864306784661, |
| "grad_norm": 5.1875, |
| "learning_rate": 2.7004716981132073e-07, |
| "loss": 0.6731, |
| "mean_token_accuracy": 0.808232493698597, |
| "num_tokens": 485551.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.2831858407079646, |
| "grad_norm": 5.84375, |
| "learning_rate": 2.818396226415094e-07, |
| "loss": 0.6548, |
| "mean_token_accuracy": 0.81433225274086, |
| "num_tokens": 506869.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.2949852507374631, |
| "grad_norm": 6.1875, |
| "learning_rate": 2.936320754716981e-07, |
| "loss": 0.7341, |
| "mean_token_accuracy": 0.7938252255320549, |
| "num_tokens": 527735.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.30678466076696165, |
| "grad_norm": 5.03125, |
| "learning_rate": 3.0542452830188673e-07, |
| "loss": 0.6604, |
| "mean_token_accuracy": 0.815295147895813, |
| "num_tokens": 548973.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.3185840707964602, |
| "grad_norm": 5.5625, |
| "learning_rate": 3.1721698113207545e-07, |
| "loss": 0.7054, |
| "mean_token_accuracy": 0.8024254336953163, |
| "num_tokens": 569628.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.3303834808259587, |
| "grad_norm": 4.96875, |
| "learning_rate": 3.2900943396226417e-07, |
| "loss": 0.7095, |
| "mean_token_accuracy": 0.7986711695790291, |
| "num_tokens": 590252.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.3421828908554572, |
| "grad_norm": 5.84375, |
| "learning_rate": 3.408018867924528e-07, |
| "loss": 0.7079, |
| "mean_token_accuracy": 0.8044031783938408, |
| "num_tokens": 611537.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.35398230088495575, |
| "grad_norm": 4.9375, |
| "learning_rate": 3.525943396226415e-07, |
| "loss": 0.7298, |
| "mean_token_accuracy": 0.7983710408210755, |
| "num_tokens": 631872.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.36578171091445427, |
| "grad_norm": 6.15625, |
| "learning_rate": 3.6438679245283017e-07, |
| "loss": 0.6706, |
| "mean_token_accuracy": 0.8168817177414894, |
| "num_tokens": 653470.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.3775811209439528, |
| "grad_norm": 5.0625, |
| "learning_rate": 3.7617924528301884e-07, |
| "loss": 0.6877, |
| "mean_token_accuracy": 0.8093630835413933, |
| "num_tokens": 675898.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.3893805309734513, |
| "grad_norm": 5.8125, |
| "learning_rate": 3.879716981132075e-07, |
| "loss": 0.6969, |
| "mean_token_accuracy": 0.8038288667798043, |
| "num_tokens": 696970.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.40117994100294985, |
| "grad_norm": 4.5625, |
| "learning_rate": 3.9976415094339623e-07, |
| "loss": 0.685, |
| "mean_token_accuracy": 0.8051398575305939, |
| "num_tokens": 717592.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.41297935103244837, |
| "grad_norm": 5.0625, |
| "learning_rate": 4.1155660377358484e-07, |
| "loss": 0.6326, |
| "mean_token_accuracy": 0.8189736142754555, |
| "num_tokens": 739533.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.4247787610619469, |
| "grad_norm": 6.9375, |
| "learning_rate": 4.2334905660377356e-07, |
| "loss": 0.6886, |
| "mean_token_accuracy": 0.807501520216465, |
| "num_tokens": 760905.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.4365781710914454, |
| "grad_norm": 6.125, |
| "learning_rate": 4.3514150943396223e-07, |
| "loss": 0.6725, |
| "mean_token_accuracy": 0.8037113741040229, |
| "num_tokens": 781198.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.44837758112094395, |
| "grad_norm": 5.90625, |
| "learning_rate": 4.469339622641509e-07, |
| "loss": 0.6651, |
| "mean_token_accuracy": 0.8105659380555152, |
| "num_tokens": 803462.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.46017699115044247, |
| "grad_norm": 4.96875, |
| "learning_rate": 4.587264150943396e-07, |
| "loss": 0.687, |
| "mean_token_accuracy": 0.8103244379162788, |
| "num_tokens": 824643.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.471976401179941, |
| "grad_norm": 5.84375, |
| "learning_rate": 4.705188679245283e-07, |
| "loss": 0.674, |
| "mean_token_accuracy": 0.8083027333021164, |
| "num_tokens": 845624.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.4837758112094395, |
| "grad_norm": 6.375, |
| "learning_rate": 4.82311320754717e-07, |
| "loss": 0.7174, |
| "mean_token_accuracy": 0.8018373742699623, |
| "num_tokens": 866719.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.49557522123893805, |
| "grad_norm": 5.53125, |
| "learning_rate": 4.941037735849057e-07, |
| "loss": 0.7443, |
| "mean_token_accuracy": 0.7902135103940964, |
| "num_tokens": 887168.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.5073746312684366, |
| "grad_norm": 4.8125, |
| "learning_rate": 5.058962264150943e-07, |
| "loss": 0.66, |
| "mean_token_accuracy": 0.8109032705426216, |
| "num_tokens": 908747.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.5191740412979351, |
| "grad_norm": 4.3125, |
| "learning_rate": 5.17688679245283e-07, |
| "loss": 0.6913, |
| "mean_token_accuracy": 0.8099235415458679, |
| "num_tokens": 930369.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.5309734513274337, |
| "grad_norm": 5.0, |
| "learning_rate": 5.294811320754716e-07, |
| "loss": 0.6729, |
| "mean_token_accuracy": 0.8100858047604561, |
| "num_tokens": 950934.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.5427728613569321, |
| "grad_norm": 5.90625, |
| "learning_rate": 5.412735849056603e-07, |
| "loss": 0.6705, |
| "mean_token_accuracy": 0.8141911312937736, |
| "num_tokens": 972368.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.5545722713864307, |
| "grad_norm": 6.34375, |
| "learning_rate": 5.530660377358491e-07, |
| "loss": 0.6783, |
| "mean_token_accuracy": 0.8158679232001305, |
| "num_tokens": 992960.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.5663716814159292, |
| "grad_norm": 7.5, |
| "learning_rate": 5.648584905660378e-07, |
| "loss": 0.6932, |
| "mean_token_accuracy": 0.80316391736269, |
| "num_tokens": 1013962.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.5781710914454278, |
| "grad_norm": 6.0625, |
| "learning_rate": 5.766509433962264e-07, |
| "loss": 0.6513, |
| "mean_token_accuracy": 0.8126063928008079, |
| "num_tokens": 1034704.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.5899705014749262, |
| "grad_norm": 5.8125, |
| "learning_rate": 5.88443396226415e-07, |
| "loss": 0.7526, |
| "mean_token_accuracy": 0.793866828083992, |
| "num_tokens": 1054763.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.6017699115044248, |
| "grad_norm": 5.4375, |
| "learning_rate": 6.002358490566037e-07, |
| "loss": 0.6659, |
| "mean_token_accuracy": 0.8127147316932678, |
| "num_tokens": 1076182.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.6135693215339233, |
| "grad_norm": 6.4375, |
| "learning_rate": 6.120283018867924e-07, |
| "loss": 0.6692, |
| "mean_token_accuracy": 0.8141843125224113, |
| "num_tokens": 1097013.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.6253687315634219, |
| "grad_norm": 5.5625, |
| "learning_rate": 6.238207547169812e-07, |
| "loss": 0.718, |
| "mean_token_accuracy": 0.7995802894234657, |
| "num_tokens": 1118341.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.6371681415929203, |
| "grad_norm": 4.40625, |
| "learning_rate": 6.356132075471698e-07, |
| "loss": 0.6832, |
| "mean_token_accuracy": 0.8065086260437966, |
| "num_tokens": 1139330.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.6489675516224189, |
| "grad_norm": 6.375, |
| "learning_rate": 6.474056603773584e-07, |
| "loss": 0.6614, |
| "mean_token_accuracy": 0.8176608860492707, |
| "num_tokens": 1161180.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.6607669616519174, |
| "grad_norm": 4.96875, |
| "learning_rate": 6.591981132075471e-07, |
| "loss": 0.6642, |
| "mean_token_accuracy": 0.8148991391062737, |
| "num_tokens": 1183943.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.672566371681416, |
| "grad_norm": 6.03125, |
| "learning_rate": 6.709905660377358e-07, |
| "loss": 0.6872, |
| "mean_token_accuracy": 0.8144838407635688, |
| "num_tokens": 1206351.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.6843657817109144, |
| "grad_norm": 4.4375, |
| "learning_rate": 6.827830188679245e-07, |
| "loss": 0.7209, |
| "mean_token_accuracy": 0.8057972431182862, |
| "num_tokens": 1228876.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.696165191740413, |
| "grad_norm": 6.9375, |
| "learning_rate": 6.945754716981132e-07, |
| "loss": 0.6914, |
| "mean_token_accuracy": 0.799898374080658, |
| "num_tokens": 1249016.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.7079646017699115, |
| "grad_norm": 4.59375, |
| "learning_rate": 7.063679245283019e-07, |
| "loss": 0.6422, |
| "mean_token_accuracy": 0.8160789251327515, |
| "num_tokens": 1270121.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.7197640117994101, |
| "grad_norm": 4.65625, |
| "learning_rate": 7.181603773584905e-07, |
| "loss": 0.6617, |
| "mean_token_accuracy": 0.8128387361764908, |
| "num_tokens": 1291378.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.7315634218289085, |
| "grad_norm": 5.59375, |
| "learning_rate": 7.299528301886792e-07, |
| "loss": 0.7247, |
| "mean_token_accuracy": 0.803736099600792, |
| "num_tokens": 1313967.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.7433628318584071, |
| "grad_norm": 5.625, |
| "learning_rate": 7.417452830188678e-07, |
| "loss": 0.6126, |
| "mean_token_accuracy": 0.8258938357234001, |
| "num_tokens": 1335374.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.7551622418879056, |
| "grad_norm": 4.09375, |
| "learning_rate": 7.535377358490566e-07, |
| "loss": 0.6049, |
| "mean_token_accuracy": 0.8208671569824219, |
| "num_tokens": 1357819.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.7669616519174042, |
| "grad_norm": 5.03125, |
| "learning_rate": 7.653301886792453e-07, |
| "loss": 0.6889, |
| "mean_token_accuracy": 0.8041063904762268, |
| "num_tokens": 1380358.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.7787610619469026, |
| "grad_norm": 5.25, |
| "learning_rate": 7.77122641509434e-07, |
| "loss": 0.6362, |
| "mean_token_accuracy": 0.8186793461441993, |
| "num_tokens": 1401353.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.7905604719764012, |
| "grad_norm": 5.53125, |
| "learning_rate": 7.889150943396225e-07, |
| "loss": 0.6305, |
| "mean_token_accuracy": 0.8209337189793586, |
| "num_tokens": 1421882.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.8023598820058997, |
| "grad_norm": 4.15625, |
| "learning_rate": 8.007075471698112e-07, |
| "loss": 0.6693, |
| "mean_token_accuracy": 0.8148546203970909, |
| "num_tokens": 1443885.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.8141592920353983, |
| "grad_norm": 3.90625, |
| "learning_rate": 8.125e-07, |
| "loss": 0.7201, |
| "mean_token_accuracy": 0.8142283573746681, |
| "num_tokens": 1466590.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.8259587020648967, |
| "grad_norm": 4.71875, |
| "learning_rate": 8.242924528301887e-07, |
| "loss": 0.6308, |
| "mean_token_accuracy": 0.8244256362318992, |
| "num_tokens": 1488555.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.8377581120943953, |
| "grad_norm": 4.25, |
| "learning_rate": 8.360849056603774e-07, |
| "loss": 0.6696, |
| "mean_token_accuracy": 0.8095144599676132, |
| "num_tokens": 1510433.0, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.8495575221238938, |
| "grad_norm": 5.59375, |
| "learning_rate": 8.47877358490566e-07, |
| "loss": 0.6598, |
| "mean_token_accuracy": 0.8179592430591583, |
| "num_tokens": 1530664.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.8613569321533924, |
| "grad_norm": 4.3125, |
| "learning_rate": 8.596698113207546e-07, |
| "loss": 0.6638, |
| "mean_token_accuracy": 0.8198263213038445, |
| "num_tokens": 1552318.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.8731563421828908, |
| "grad_norm": 5.0, |
| "learning_rate": 8.714622641509433e-07, |
| "loss": 0.6713, |
| "mean_token_accuracy": 0.8134427219629288, |
| "num_tokens": 1573048.0, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.8849557522123894, |
| "grad_norm": 5.3125, |
| "learning_rate": 8.832547169811321e-07, |
| "loss": 0.648, |
| "mean_token_accuracy": 0.8186259895563126, |
| "num_tokens": 1592885.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.8967551622418879, |
| "grad_norm": 5.375, |
| "learning_rate": 8.950471698113207e-07, |
| "loss": 0.6576, |
| "mean_token_accuracy": 0.8126331850886345, |
| "num_tokens": 1614569.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.9085545722713865, |
| "grad_norm": 4.3125, |
| "learning_rate": 9.068396226415094e-07, |
| "loss": 0.6794, |
| "mean_token_accuracy": 0.8158453807234765, |
| "num_tokens": 1634680.0, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.9203539823008849, |
| "grad_norm": 3.984375, |
| "learning_rate": 9.186320754716981e-07, |
| "loss": 0.6269, |
| "mean_token_accuracy": 0.8237964197993278, |
| "num_tokens": 1656489.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.9321533923303835, |
| "grad_norm": 5.09375, |
| "learning_rate": 9.304245283018867e-07, |
| "loss": 0.6573, |
| "mean_token_accuracy": 0.8234946221113205, |
| "num_tokens": 1679229.0, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.943952802359882, |
| "grad_norm": 5.3125, |
| "learning_rate": 9.422169811320754e-07, |
| "loss": 0.6174, |
| "mean_token_accuracy": 0.8271913766860962, |
| "num_tokens": 1699784.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.9557522123893806, |
| "grad_norm": 4.71875, |
| "learning_rate": 9.54009433962264e-07, |
| "loss": 0.6321, |
| "mean_token_accuracy": 0.8152614802122116, |
| "num_tokens": 1720484.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.967551622418879, |
| "grad_norm": 4.09375, |
| "learning_rate": 9.658018867924527e-07, |
| "loss": 0.6154, |
| "mean_token_accuracy": 0.8224340006709099, |
| "num_tokens": 1743292.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.9793510324483776, |
| "grad_norm": 4.75, |
| "learning_rate": 9.775943396226415e-07, |
| "loss": 0.6298, |
| "mean_token_accuracy": 0.8266109019517899, |
| "num_tokens": 1765368.0, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.9911504424778761, |
| "grad_norm": 4.625, |
| "learning_rate": 9.893867924528301e-07, |
| "loss": 0.6407, |
| "mean_token_accuracy": 0.8173819676041603, |
| "num_tokens": 1786367.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.0023598820058996, |
| "grad_norm": 5.3125, |
| "learning_rate": 1.0011792452830187e-06, |
| "loss": 0.6244, |
| "mean_token_accuracy": 0.8294386032380556, |
| "num_tokens": 1805897.0, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.0141592920353983, |
| "grad_norm": 5.0625, |
| "learning_rate": 1.0129716981132076e-06, |
| "loss": 0.6741, |
| "mean_token_accuracy": 0.8121012166142464, |
| "num_tokens": 1826767.0, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.0259587020648968, |
| "grad_norm": 4.15625, |
| "learning_rate": 1.0247641509433962e-06, |
| "loss": 0.6994, |
| "mean_token_accuracy": 0.8061724543571472, |
| "num_tokens": 1847473.0, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.0377581120943953, |
| "grad_norm": 3.578125, |
| "learning_rate": 1.036556603773585e-06, |
| "loss": 0.6488, |
| "mean_token_accuracy": 0.820195910334587, |
| "num_tokens": 1871198.0, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.0495575221238937, |
| "grad_norm": 3.875, |
| "learning_rate": 1.0483490566037736e-06, |
| "loss": 0.609, |
| "mean_token_accuracy": 0.8251825600862503, |
| "num_tokens": 1893403.0, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.0613569321533922, |
| "grad_norm": 3.890625, |
| "learning_rate": 1.0601415094339622e-06, |
| "loss": 0.5795, |
| "mean_token_accuracy": 0.8335932061076164, |
| "num_tokens": 1915203.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.073156342182891, |
| "grad_norm": 4.625, |
| "learning_rate": 1.071933962264151e-06, |
| "loss": 0.5872, |
| "mean_token_accuracy": 0.8251020297408104, |
| "num_tokens": 1935687.0, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.0849557522123894, |
| "grad_norm": 4.96875, |
| "learning_rate": 1.0837264150943395e-06, |
| "loss": 0.6337, |
| "mean_token_accuracy": 0.820446926355362, |
| "num_tokens": 1956282.0, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.0967551622418878, |
| "grad_norm": 3.65625, |
| "learning_rate": 1.0955188679245283e-06, |
| "loss": 0.6144, |
| "mean_token_accuracy": 0.825304602086544, |
| "num_tokens": 1977878.0, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.1085545722713865, |
| "grad_norm": 3.859375, |
| "learning_rate": 1.107311320754717e-06, |
| "loss": 0.5684, |
| "mean_token_accuracy": 0.8271389842033386, |
| "num_tokens": 1998711.0, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.120353982300885, |
| "grad_norm": 3.84375, |
| "learning_rate": 1.1191037735849055e-06, |
| "loss": 0.6226, |
| "mean_token_accuracy": 0.8217311635613441, |
| "num_tokens": 2020451.0, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.1321533923303835, |
| "grad_norm": 4.03125, |
| "learning_rate": 1.1308962264150943e-06, |
| "loss": 0.5764, |
| "mean_token_accuracy": 0.8377224639058113, |
| "num_tokens": 2042194.0, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.143952802359882, |
| "grad_norm": 4.96875, |
| "learning_rate": 1.142688679245283e-06, |
| "loss": 0.6044, |
| "mean_token_accuracy": 0.8252048686146736, |
| "num_tokens": 2062931.0, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.1557522123893804, |
| "grad_norm": 5.03125, |
| "learning_rate": 1.1544811320754718e-06, |
| "loss": 0.6484, |
| "mean_token_accuracy": 0.820095656812191, |
| "num_tokens": 2084541.0, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.167551622418879, |
| "grad_norm": 3.59375, |
| "learning_rate": 1.1662735849056604e-06, |
| "loss": 0.5882, |
| "mean_token_accuracy": 0.8260459393262863, |
| "num_tokens": 2106895.0, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.1793510324483776, |
| "grad_norm": 4.21875, |
| "learning_rate": 1.1780660377358488e-06, |
| "loss": 0.5892, |
| "mean_token_accuracy": 0.8302577301859856, |
| "num_tokens": 2126601.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.191150442477876, |
| "grad_norm": 4.21875, |
| "learning_rate": 1.1898584905660376e-06, |
| "loss": 0.596, |
| "mean_token_accuracy": 0.8300721794366837, |
| "num_tokens": 2147651.0, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.2029498525073747, |
| "grad_norm": 4.125, |
| "learning_rate": 1.2016509433962262e-06, |
| "loss": 0.6265, |
| "mean_token_accuracy": 0.8184733793139458, |
| "num_tokens": 2169367.0, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.2147492625368732, |
| "grad_norm": 4.65625, |
| "learning_rate": 1.213443396226415e-06, |
| "loss": 0.592, |
| "mean_token_accuracy": 0.8349920228123665, |
| "num_tokens": 2191012.0, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.2265486725663717, |
| "grad_norm": 4.59375, |
| "learning_rate": 1.2252358490566037e-06, |
| "loss": 0.6034, |
| "mean_token_accuracy": 0.8312716528773307, |
| "num_tokens": 2212939.0, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.2383480825958701, |
| "grad_norm": 5.0625, |
| "learning_rate": 1.2370283018867925e-06, |
| "loss": 0.5924, |
| "mean_token_accuracy": 0.8298429310321808, |
| "num_tokens": 2234514.0, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.2501474926253686, |
| "grad_norm": 4.125, |
| "learning_rate": 1.2488207547169811e-06, |
| "loss": 0.6284, |
| "mean_token_accuracy": 0.8235250413417816, |
| "num_tokens": 2255100.0, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.2619469026548673, |
| "grad_norm": 3.625, |
| "learning_rate": 1.2606132075471697e-06, |
| "loss": 0.6338, |
| "mean_token_accuracy": 0.8273122042417527, |
| "num_tokens": 2277305.0, |
| "step": 1070 |
| }, |
| { |
| "epoch": 1.2737463126843658, |
| "grad_norm": 3.71875, |
| "learning_rate": 1.2724056603773586e-06, |
| "loss": 0.5474, |
| "mean_token_accuracy": 0.8388556599617004, |
| "num_tokens": 2298303.0, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.2855457227138642, |
| "grad_norm": 4.40625, |
| "learning_rate": 1.284198113207547e-06, |
| "loss": 0.6227, |
| "mean_token_accuracy": 0.8316986605525016, |
| "num_tokens": 2321960.0, |
| "step": 1090 |
| }, |
| { |
| "epoch": 1.297345132743363, |
| "grad_norm": 5.03125, |
| "learning_rate": 1.2959905660377358e-06, |
| "loss": 0.6085, |
| "mean_token_accuracy": 0.829347026348114, |
| "num_tokens": 2342881.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.3091445427728614, |
| "grad_norm": 4.71875, |
| "learning_rate": 1.3077830188679244e-06, |
| "loss": 0.6045, |
| "mean_token_accuracy": 0.8265531584620476, |
| "num_tokens": 2363552.0, |
| "step": 1110 |
| }, |
| { |
| "epoch": 1.3209439528023599, |
| "grad_norm": 4.3125, |
| "learning_rate": 1.3195754716981132e-06, |
| "loss": 0.6285, |
| "mean_token_accuracy": 0.823639677464962, |
| "num_tokens": 2385275.0, |
| "step": 1120 |
| }, |
| { |
| "epoch": 1.3327433628318583, |
| "grad_norm": 3.84375, |
| "learning_rate": 1.3313679245283018e-06, |
| "loss": 0.5949, |
| "mean_token_accuracy": 0.8321825504302979, |
| "num_tokens": 2406666.0, |
| "step": 1130 |
| }, |
| { |
| "epoch": 1.3445427728613568, |
| "grad_norm": 3.6875, |
| "learning_rate": 1.3431603773584905e-06, |
| "loss": 0.5687, |
| "mean_token_accuracy": 0.8341725096106529, |
| "num_tokens": 2427158.0, |
| "step": 1140 |
| }, |
| { |
| "epoch": 1.3563421828908555, |
| "grad_norm": 4.65625, |
| "learning_rate": 1.3549528301886793e-06, |
| "loss": 0.5815, |
| "mean_token_accuracy": 0.837175740301609, |
| "num_tokens": 2448691.0, |
| "step": 1150 |
| }, |
| { |
| "epoch": 1.368141592920354, |
| "grad_norm": 3.9375, |
| "learning_rate": 1.3667452830188679e-06, |
| "loss": 0.583, |
| "mean_token_accuracy": 0.8346440017223358, |
| "num_tokens": 2468769.0, |
| "step": 1160 |
| }, |
| { |
| "epoch": 1.3799410029498524, |
| "grad_norm": 3.59375, |
| "learning_rate": 1.3785377358490567e-06, |
| "loss": 0.5995, |
| "mean_token_accuracy": 0.836300277709961, |
| "num_tokens": 2491054.0, |
| "step": 1170 |
| }, |
| { |
| "epoch": 1.3917404129793511, |
| "grad_norm": 4.03125, |
| "learning_rate": 1.3903301886792451e-06, |
| "loss": 0.5681, |
| "mean_token_accuracy": 0.837888953089714, |
| "num_tokens": 2513210.0, |
| "step": 1180 |
| }, |
| { |
| "epoch": 1.4035398230088496, |
| "grad_norm": 3.375, |
| "learning_rate": 1.4021226415094337e-06, |
| "loss": 0.5836, |
| "mean_token_accuracy": 0.8354374140501022, |
| "num_tokens": 2534603.0, |
| "step": 1190 |
| }, |
| { |
| "epoch": 1.415339233038348, |
| "grad_norm": 3.65625, |
| "learning_rate": 1.4139150943396226e-06, |
| "loss": 0.5832, |
| "mean_token_accuracy": 0.8318114146590233, |
| "num_tokens": 2555517.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.4271386430678465, |
| "grad_norm": 3.65625, |
| "learning_rate": 1.4257075471698112e-06, |
| "loss": 0.5895, |
| "mean_token_accuracy": 0.8386975318193436, |
| "num_tokens": 2577192.0, |
| "step": 1210 |
| }, |
| { |
| "epoch": 1.438938053097345, |
| "grad_norm": 2.953125, |
| "learning_rate": 1.4375e-06, |
| "loss": 0.6422, |
| "mean_token_accuracy": 0.8283508166670799, |
| "num_tokens": 2599141.0, |
| "step": 1220 |
| }, |
| { |
| "epoch": 1.4507374631268437, |
| "grad_norm": 3.390625, |
| "learning_rate": 1.4492924528301886e-06, |
| "loss": 0.6105, |
| "mean_token_accuracy": 0.8319273814558983, |
| "num_tokens": 2619202.0, |
| "step": 1230 |
| }, |
| { |
| "epoch": 1.4625368731563422, |
| "grad_norm": 3.359375, |
| "learning_rate": 1.4610849056603774e-06, |
| "loss": 0.5619, |
| "mean_token_accuracy": 0.8401564568281173, |
| "num_tokens": 2641169.0, |
| "step": 1240 |
| }, |
| { |
| "epoch": 1.4743362831858406, |
| "grad_norm": 3.84375, |
| "learning_rate": 1.472877358490566e-06, |
| "loss": 0.5661, |
| "mean_token_accuracy": 0.8366801410913467, |
| "num_tokens": 2661930.0, |
| "step": 1250 |
| }, |
| { |
| "epoch": 1.4861356932153393, |
| "grad_norm": 3.203125, |
| "learning_rate": 1.4846698113207547e-06, |
| "loss": 0.5693, |
| "mean_token_accuracy": 0.8446071773767472, |
| "num_tokens": 2683299.0, |
| "step": 1260 |
| }, |
| { |
| "epoch": 1.4979351032448378, |
| "grad_norm": 3.15625, |
| "learning_rate": 1.4964622641509433e-06, |
| "loss": 0.5478, |
| "mean_token_accuracy": 0.8425341203808785, |
| "num_tokens": 2703820.0, |
| "step": 1270 |
| }, |
| { |
| "epoch": 1.5097345132743363, |
| "grad_norm": 3.265625, |
| "learning_rate": 1.508254716981132e-06, |
| "loss": 0.5287, |
| "mean_token_accuracy": 0.8478223592042923, |
| "num_tokens": 2724842.0, |
| "step": 1280 |
| }, |
| { |
| "epoch": 1.521533923303835, |
| "grad_norm": 2.890625, |
| "learning_rate": 1.5200471698113207e-06, |
| "loss": 0.5799, |
| "mean_token_accuracy": 0.8372409701347351, |
| "num_tokens": 2745794.0, |
| "step": 1290 |
| }, |
| { |
| "epoch": 1.5333333333333332, |
| "grad_norm": 3.703125, |
| "learning_rate": 1.5318396226415093e-06, |
| "loss": 0.5604, |
| "mean_token_accuracy": 0.8442010298371315, |
| "num_tokens": 2767304.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.545132743362832, |
| "grad_norm": 4.25, |
| "learning_rate": 1.543632075471698e-06, |
| "loss": 0.5932, |
| "mean_token_accuracy": 0.8349510475993156, |
| "num_tokens": 2787486.0, |
| "step": 1310 |
| }, |
| { |
| "epoch": 1.5569321533923304, |
| "grad_norm": 3.578125, |
| "learning_rate": 1.5554245283018868e-06, |
| "loss": 0.5813, |
| "mean_token_accuracy": 0.8361784920096398, |
| "num_tokens": 2808938.0, |
| "step": 1320 |
| }, |
| { |
| "epoch": 1.5687315634218288, |
| "grad_norm": 3.875, |
| "learning_rate": 1.5672169811320754e-06, |
| "loss": 0.553, |
| "mean_token_accuracy": 0.8432978987693787, |
| "num_tokens": 2829935.0, |
| "step": 1330 |
| }, |
| { |
| "epoch": 1.5805309734513275, |
| "grad_norm": 3.125, |
| "learning_rate": 1.5790094339622642e-06, |
| "loss": 0.5927, |
| "mean_token_accuracy": 0.8364956706762314, |
| "num_tokens": 2850325.0, |
| "step": 1340 |
| }, |
| { |
| "epoch": 1.592330383480826, |
| "grad_norm": 3.515625, |
| "learning_rate": 1.5908018867924528e-06, |
| "loss": 0.5882, |
| "mean_token_accuracy": 0.833353728055954, |
| "num_tokens": 2870882.0, |
| "step": 1350 |
| }, |
| { |
| "epoch": 1.6041297935103245, |
| "grad_norm": 3.625, |
| "learning_rate": 1.6025943396226414e-06, |
| "loss": 0.5804, |
| "mean_token_accuracy": 0.8396367952227592, |
| "num_tokens": 2891620.0, |
| "step": 1360 |
| }, |
| { |
| "epoch": 1.6159292035398232, |
| "grad_norm": 3.359375, |
| "learning_rate": 1.61438679245283e-06, |
| "loss": 0.5438, |
| "mean_token_accuracy": 0.8412434130907058, |
| "num_tokens": 2912696.0, |
| "step": 1370 |
| }, |
| { |
| "epoch": 1.6277286135693214, |
| "grad_norm": 3.921875, |
| "learning_rate": 1.6261792452830187e-06, |
| "loss": 0.5468, |
| "mean_token_accuracy": 0.8425421223044396, |
| "num_tokens": 2933087.0, |
| "step": 1380 |
| }, |
| { |
| "epoch": 1.63952802359882, |
| "grad_norm": 4.375, |
| "learning_rate": 1.6379716981132075e-06, |
| "loss": 0.5615, |
| "mean_token_accuracy": 0.8425424665212631, |
| "num_tokens": 2953617.0, |
| "step": 1390 |
| }, |
| { |
| "epoch": 1.6513274336283186, |
| "grad_norm": 3.328125, |
| "learning_rate": 1.6497641509433961e-06, |
| "loss": 0.5907, |
| "mean_token_accuracy": 0.8397417709231376, |
| "num_tokens": 2973285.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.663126843657817, |
| "grad_norm": 2.75, |
| "learning_rate": 1.661556603773585e-06, |
| "loss": 0.5586, |
| "mean_token_accuracy": 0.842847327888012, |
| "num_tokens": 2994051.0, |
| "step": 1410 |
| }, |
| { |
| "epoch": 1.6749262536873157, |
| "grad_norm": 3.203125, |
| "learning_rate": 1.6733490566037736e-06, |
| "loss": 0.5705, |
| "mean_token_accuracy": 0.8413301527500152, |
| "num_tokens": 3015011.0, |
| "step": 1420 |
| }, |
| { |
| "epoch": 1.6867256637168142, |
| "grad_norm": 3.453125, |
| "learning_rate": 1.6851415094339622e-06, |
| "loss": 0.5349, |
| "mean_token_accuracy": 0.848284013569355, |
| "num_tokens": 3036272.0, |
| "step": 1430 |
| }, |
| { |
| "epoch": 1.6985250737463127, |
| "grad_norm": 4.1875, |
| "learning_rate": 1.696933962264151e-06, |
| "loss": 0.5539, |
| "mean_token_accuracy": 0.8459584936499596, |
| "num_tokens": 3057966.0, |
| "step": 1440 |
| }, |
| { |
| "epoch": 1.7103244837758114, |
| "grad_norm": 3.609375, |
| "learning_rate": 1.7087264150943394e-06, |
| "loss": 0.5723, |
| "mean_token_accuracy": 0.8414489448070526, |
| "num_tokens": 3079475.0, |
| "step": 1450 |
| }, |
| { |
| "epoch": 1.7221238938053096, |
| "grad_norm": 3.75, |
| "learning_rate": 1.7205188679245282e-06, |
| "loss": 0.588, |
| "mean_token_accuracy": 0.8319360539317131, |
| "num_tokens": 3100780.0, |
| "step": 1460 |
| }, |
| { |
| "epoch": 1.7339233038348083, |
| "grad_norm": 3.359375, |
| "learning_rate": 1.7323113207547168e-06, |
| "loss": 0.5432, |
| "mean_token_accuracy": 0.8435217276215553, |
| "num_tokens": 3121924.0, |
| "step": 1470 |
| }, |
| { |
| "epoch": 1.7457227138643068, |
| "grad_norm": 4.9375, |
| "learning_rate": 1.7441037735849057e-06, |
| "loss": 0.5741, |
| "mean_token_accuracy": 0.8455688193440437, |
| "num_tokens": 3144307.0, |
| "step": 1480 |
| }, |
| { |
| "epoch": 1.7575221238938052, |
| "grad_norm": 3.78125, |
| "learning_rate": 1.7558962264150943e-06, |
| "loss": 0.5817, |
| "mean_token_accuracy": 0.8385950595140457, |
| "num_tokens": 3165047.0, |
| "step": 1490 |
| }, |
| { |
| "epoch": 1.769321533923304, |
| "grad_norm": 3.15625, |
| "learning_rate": 1.767688679245283e-06, |
| "loss": 0.5772, |
| "mean_token_accuracy": 0.836099736392498, |
| "num_tokens": 3187153.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.7811209439528024, |
| "grad_norm": 3.234375, |
| "learning_rate": 1.7794811320754717e-06, |
| "loss": 0.5298, |
| "mean_token_accuracy": 0.847694669663906, |
| "num_tokens": 3208354.0, |
| "step": 1510 |
| }, |
| { |
| "epoch": 1.7929203539823009, |
| "grad_norm": 3.625, |
| "learning_rate": 1.7912735849056603e-06, |
| "loss": 0.4974, |
| "mean_token_accuracy": 0.8575958669185638, |
| "num_tokens": 3229140.0, |
| "step": 1520 |
| }, |
| { |
| "epoch": 1.8047197640117996, |
| "grad_norm": 3.265625, |
| "learning_rate": 1.803066037735849e-06, |
| "loss": 0.5668, |
| "mean_token_accuracy": 0.8394325375556946, |
| "num_tokens": 3250803.0, |
| "step": 1530 |
| }, |
| { |
| "epoch": 1.8165191740412978, |
| "grad_norm": 2.65625, |
| "learning_rate": 1.8148584905660376e-06, |
| "loss": 0.5511, |
| "mean_token_accuracy": 0.8462499424815177, |
| "num_tokens": 3271253.0, |
| "step": 1540 |
| }, |
| { |
| "epoch": 1.8283185840707965, |
| "grad_norm": 3.6875, |
| "learning_rate": 1.8266509433962262e-06, |
| "loss": 0.5171, |
| "mean_token_accuracy": 0.847356554865837, |
| "num_tokens": 3292672.0, |
| "step": 1550 |
| }, |
| { |
| "epoch": 1.840117994100295, |
| "grad_norm": 3.234375, |
| "learning_rate": 1.838443396226415e-06, |
| "loss": 0.5641, |
| "mean_token_accuracy": 0.8406664237380028, |
| "num_tokens": 3314273.0, |
| "step": 1560 |
| }, |
| { |
| "epoch": 1.8519174041297934, |
| "grad_norm": 3.34375, |
| "learning_rate": 1.8502358490566036e-06, |
| "loss": 0.5229, |
| "mean_token_accuracy": 0.8563799113035202, |
| "num_tokens": 3335501.0, |
| "step": 1570 |
| }, |
| { |
| "epoch": 1.8637168141592921, |
| "grad_norm": 2.5625, |
| "learning_rate": 1.8620283018867924e-06, |
| "loss": 0.5376, |
| "mean_token_accuracy": 0.8446942239999771, |
| "num_tokens": 3357741.0, |
| "step": 1580 |
| }, |
| { |
| "epoch": 1.8755162241887906, |
| "grad_norm": 3.03125, |
| "learning_rate": 1.873820754716981e-06, |
| "loss": 0.5583, |
| "mean_token_accuracy": 0.8445750951766968, |
| "num_tokens": 3379338.0, |
| "step": 1590 |
| }, |
| { |
| "epoch": 1.887315634218289, |
| "grad_norm": 2.921875, |
| "learning_rate": 1.8856132075471699e-06, |
| "loss": 0.5493, |
| "mean_token_accuracy": 0.8522657513618469, |
| "num_tokens": 3401116.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.8991150442477878, |
| "grad_norm": 3.203125, |
| "learning_rate": 1.8974056603773585e-06, |
| "loss": 0.5406, |
| "mean_token_accuracy": 0.8431693106889725, |
| "num_tokens": 3421976.0, |
| "step": 1610 |
| }, |
| { |
| "epoch": 1.910914454277286, |
| "grad_norm": 3.171875, |
| "learning_rate": 1.909198113207547e-06, |
| "loss": 0.5432, |
| "mean_token_accuracy": 0.8524840101599693, |
| "num_tokens": 3444030.0, |
| "step": 1620 |
| }, |
| { |
| "epoch": 1.9227138643067847, |
| "grad_norm": 3.8125, |
| "learning_rate": 1.9209905660377355e-06, |
| "loss": 0.5547, |
| "mean_token_accuracy": 0.8424161151051521, |
| "num_tokens": 3465245.0, |
| "step": 1630 |
| }, |
| { |
| "epoch": 1.9345132743362832, |
| "grad_norm": 3.125, |
| "learning_rate": 1.9327830188679246e-06, |
| "loss": 0.6043, |
| "mean_token_accuracy": 0.8421701610088348, |
| "num_tokens": 3487470.0, |
| "step": 1640 |
| }, |
| { |
| "epoch": 1.9463126843657816, |
| "grad_norm": 3.046875, |
| "learning_rate": 1.944575471698113e-06, |
| "loss": 0.6093, |
| "mean_token_accuracy": 0.8339162111282349, |
| "num_tokens": 3507871.0, |
| "step": 1650 |
| }, |
| { |
| "epoch": 1.9581120943952803, |
| "grad_norm": 3.1875, |
| "learning_rate": 1.9563679245283018e-06, |
| "loss": 0.513, |
| "mean_token_accuracy": 0.8540987521409988, |
| "num_tokens": 3528268.0, |
| "step": 1660 |
| }, |
| { |
| "epoch": 1.9699115044247788, |
| "grad_norm": 3.15625, |
| "learning_rate": 1.9681603773584904e-06, |
| "loss": 0.5089, |
| "mean_token_accuracy": 0.8553141549229621, |
| "num_tokens": 3550114.0, |
| "step": 1670 |
| }, |
| { |
| "epoch": 1.9817109144542773, |
| "grad_norm": 3.1875, |
| "learning_rate": 1.979952830188679e-06, |
| "loss": 0.5294, |
| "mean_token_accuracy": 0.8518137603998184, |
| "num_tokens": 3571296.0, |
| "step": 1680 |
| }, |
| { |
| "epoch": 1.993510324483776, |
| "grad_norm": 4.0625, |
| "learning_rate": 1.991745283018868e-06, |
| "loss": 0.5451, |
| "mean_token_accuracy": 0.8485597312450409, |
| "num_tokens": 3592739.0, |
| "step": 1690 |
| }, |
| { |
| "epoch": 2.0047197640117993, |
| "grad_norm": 3.4375, |
| "learning_rate": 1.9996069182389935e-06, |
| "loss": 0.5335, |
| "mean_token_accuracy": 0.8465279262316855, |
| "num_tokens": 3612037.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.016519174041298, |
| "grad_norm": 3.09375, |
| "learning_rate": 1.998296645702306e-06, |
| "loss": 0.5037, |
| "mean_token_accuracy": 0.8575536519289017, |
| "num_tokens": 3633743.0, |
| "step": 1710 |
| }, |
| { |
| "epoch": 2.0283185840707967, |
| "grad_norm": 3.21875, |
| "learning_rate": 1.9969863731656184e-06, |
| "loss": 0.5768, |
| "mean_token_accuracy": 0.8424099072813988, |
| "num_tokens": 3654779.0, |
| "step": 1720 |
| }, |
| { |
| "epoch": 2.040117994100295, |
| "grad_norm": 3.34375, |
| "learning_rate": 1.9956761006289308e-06, |
| "loss": 0.5674, |
| "mean_token_accuracy": 0.8452699959278107, |
| "num_tokens": 3675851.0, |
| "step": 1730 |
| }, |
| { |
| "epoch": 2.0519174041297936, |
| "grad_norm": 2.34375, |
| "learning_rate": 1.994365828092243e-06, |
| "loss": 0.5293, |
| "mean_token_accuracy": 0.8562508404254914, |
| "num_tokens": 3698066.0, |
| "step": 1740 |
| }, |
| { |
| "epoch": 2.063716814159292, |
| "grad_norm": 3.296875, |
| "learning_rate": 1.9930555555555556e-06, |
| "loss": 0.5562, |
| "mean_token_accuracy": 0.8481244757771492, |
| "num_tokens": 3719035.0, |
| "step": 1750 |
| }, |
| { |
| "epoch": 2.0755162241887906, |
| "grad_norm": 3.015625, |
| "learning_rate": 1.991745283018868e-06, |
| "loss": 0.5163, |
| "mean_token_accuracy": 0.8547627568244934, |
| "num_tokens": 3740303.0, |
| "step": 1760 |
| }, |
| { |
| "epoch": 2.0873156342182893, |
| "grad_norm": 3.125, |
| "learning_rate": 1.99043501048218e-06, |
| "loss": 0.504, |
| "mean_token_accuracy": 0.8565779209136963, |
| "num_tokens": 3762335.0, |
| "step": 1770 |
| }, |
| { |
| "epoch": 2.0991150442477875, |
| "grad_norm": 3.203125, |
| "learning_rate": 1.9891247379454925e-06, |
| "loss": 0.5154, |
| "mean_token_accuracy": 0.85212994068861, |
| "num_tokens": 3783806.0, |
| "step": 1780 |
| }, |
| { |
| "epoch": 2.110914454277286, |
| "grad_norm": 3.3125, |
| "learning_rate": 1.987814465408805e-06, |
| "loss": 0.5426, |
| "mean_token_accuracy": 0.849507075548172, |
| "num_tokens": 3804832.0, |
| "step": 1790 |
| }, |
| { |
| "epoch": 2.1227138643067844, |
| "grad_norm": 3.046875, |
| "learning_rate": 1.9865041928721173e-06, |
| "loss": 0.5158, |
| "mean_token_accuracy": 0.8580360785126686, |
| "num_tokens": 3826205.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.134513274336283, |
| "grad_norm": 2.609375, |
| "learning_rate": 1.9851939203354297e-06, |
| "loss": 0.5005, |
| "mean_token_accuracy": 0.8555154889822006, |
| "num_tokens": 3846562.0, |
| "step": 1810 |
| }, |
| { |
| "epoch": 2.146312684365782, |
| "grad_norm": 3.546875, |
| "learning_rate": 1.983883647798742e-06, |
| "loss": 0.5212, |
| "mean_token_accuracy": 0.8508405730128288, |
| "num_tokens": 3868217.0, |
| "step": 1820 |
| }, |
| { |
| "epoch": 2.15811209439528, |
| "grad_norm": 2.9375, |
| "learning_rate": 1.982573375262054e-06, |
| "loss": 0.5339, |
| "mean_token_accuracy": 0.8502182424068451, |
| "num_tokens": 3890241.0, |
| "step": 1830 |
| }, |
| { |
| "epoch": 2.1699115044247788, |
| "grad_norm": 3.0625, |
| "learning_rate": 1.9812631027253666e-06, |
| "loss": 0.5404, |
| "mean_token_accuracy": 0.8525576934218406, |
| "num_tokens": 3910950.0, |
| "step": 1840 |
| }, |
| { |
| "epoch": 2.1817109144542775, |
| "grad_norm": 3.75, |
| "learning_rate": 1.979952830188679e-06, |
| "loss": 0.5361, |
| "mean_token_accuracy": 0.8487749889492988, |
| "num_tokens": 3932488.0, |
| "step": 1850 |
| }, |
| { |
| "epoch": 2.1935103244837757, |
| "grad_norm": 3.9375, |
| "learning_rate": 1.9786425576519914e-06, |
| "loss": 0.578, |
| "mean_token_accuracy": 0.8436717450618744, |
| "num_tokens": 3954059.0, |
| "step": 1860 |
| }, |
| { |
| "epoch": 2.2053097345132744, |
| "grad_norm": 3.125, |
| "learning_rate": 1.977332285115304e-06, |
| "loss": 0.4933, |
| "mean_token_accuracy": 0.8593879714608192, |
| "num_tokens": 3975757.0, |
| "step": 1870 |
| }, |
| { |
| "epoch": 2.217109144542773, |
| "grad_norm": 2.359375, |
| "learning_rate": 1.9760220125786163e-06, |
| "loss": 0.5359, |
| "mean_token_accuracy": 0.8517615765333175, |
| "num_tokens": 3997751.0, |
| "step": 1880 |
| }, |
| { |
| "epoch": 2.2289085545722713, |
| "grad_norm": 3.0625, |
| "learning_rate": 1.9747117400419287e-06, |
| "loss": 0.4874, |
| "mean_token_accuracy": 0.8592483907938003, |
| "num_tokens": 4018155.0, |
| "step": 1890 |
| }, |
| { |
| "epoch": 2.24070796460177, |
| "grad_norm": 2.9375, |
| "learning_rate": 1.973401467505241e-06, |
| "loss": 0.4987, |
| "mean_token_accuracy": 0.8605604246258736, |
| "num_tokens": 4039343.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 2.2525073746312683, |
| "grad_norm": 3.34375, |
| "learning_rate": 1.9720911949685536e-06, |
| "loss": 0.483, |
| "mean_token_accuracy": 0.8658687263727188, |
| "num_tokens": 4061269.0, |
| "step": 1910 |
| }, |
| { |
| "epoch": 2.264306784660767, |
| "grad_norm": 2.984375, |
| "learning_rate": 1.9707809224318656e-06, |
| "loss": 0.5493, |
| "mean_token_accuracy": 0.8477798432111741, |
| "num_tokens": 4082926.0, |
| "step": 1920 |
| }, |
| { |
| "epoch": 2.2761061946902656, |
| "grad_norm": 2.71875, |
| "learning_rate": 1.969470649895178e-06, |
| "loss": 0.5093, |
| "mean_token_accuracy": 0.8569615930318832, |
| "num_tokens": 4105412.0, |
| "step": 1930 |
| }, |
| { |
| "epoch": 2.287905604719764, |
| "grad_norm": 3.328125, |
| "learning_rate": 1.9681603773584904e-06, |
| "loss": 0.5172, |
| "mean_token_accuracy": 0.8497282296419144, |
| "num_tokens": 4127161.0, |
| "step": 1940 |
| }, |
| { |
| "epoch": 2.2997050147492626, |
| "grad_norm": 3.453125, |
| "learning_rate": 1.966850104821803e-06, |
| "loss": 0.4735, |
| "mean_token_accuracy": 0.8628137081861496, |
| "num_tokens": 4149328.0, |
| "step": 1950 |
| }, |
| { |
| "epoch": 2.311504424778761, |
| "grad_norm": 2.953125, |
| "learning_rate": 1.9655398322851152e-06, |
| "loss": 0.511, |
| "mean_token_accuracy": 0.8592940300703049, |
| "num_tokens": 4170134.0, |
| "step": 1960 |
| }, |
| { |
| "epoch": 2.3233038348082595, |
| "grad_norm": 3.5, |
| "learning_rate": 1.9642295597484277e-06, |
| "loss": 0.4887, |
| "mean_token_accuracy": 0.8641975656151771, |
| "num_tokens": 4190631.0, |
| "step": 1970 |
| }, |
| { |
| "epoch": 2.335103244837758, |
| "grad_norm": 3.421875, |
| "learning_rate": 1.96291928721174e-06, |
| "loss": 0.5254, |
| "mean_token_accuracy": 0.8557097122073174, |
| "num_tokens": 4212640.0, |
| "step": 1980 |
| }, |
| { |
| "epoch": 2.3469026548672565, |
| "grad_norm": 2.625, |
| "learning_rate": 1.9616090146750525e-06, |
| "loss": 0.5708, |
| "mean_token_accuracy": 0.8456377193331719, |
| "num_tokens": 4233153.0, |
| "step": 1990 |
| }, |
| { |
| "epoch": 2.358702064896755, |
| "grad_norm": 2.546875, |
| "learning_rate": 1.960298742138365e-06, |
| "loss": 0.4959, |
| "mean_token_accuracy": 0.8615874752402306, |
| "num_tokens": 4254745.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 2.370501474926254, |
| "grad_norm": 3.453125, |
| "learning_rate": 1.958988469601677e-06, |
| "loss": 0.5349, |
| "mean_token_accuracy": 0.8470118582248688, |
| "num_tokens": 4276926.0, |
| "step": 2010 |
| }, |
| { |
| "epoch": 2.382300884955752, |
| "grad_norm": 2.515625, |
| "learning_rate": 1.9576781970649894e-06, |
| "loss": 0.5134, |
| "mean_token_accuracy": 0.8535007134079933, |
| "num_tokens": 4298091.0, |
| "step": 2020 |
| }, |
| { |
| "epoch": 2.394100294985251, |
| "grad_norm": 3.09375, |
| "learning_rate": 1.9563679245283018e-06, |
| "loss": 0.5126, |
| "mean_token_accuracy": 0.8601648762822152, |
| "num_tokens": 4318556.0, |
| "step": 2030 |
| }, |
| { |
| "epoch": 2.4058997050147495, |
| "grad_norm": 2.515625, |
| "learning_rate": 1.955057651991614e-06, |
| "loss": 0.5108, |
| "mean_token_accuracy": 0.8596499621868133, |
| "num_tokens": 4340403.0, |
| "step": 2040 |
| }, |
| { |
| "epoch": 2.4176991150442477, |
| "grad_norm": 2.8125, |
| "learning_rate": 1.9537473794549266e-06, |
| "loss": 0.5247, |
| "mean_token_accuracy": 0.8498525634407997, |
| "num_tokens": 4361497.0, |
| "step": 2050 |
| }, |
| { |
| "epoch": 2.4294985250737464, |
| "grad_norm": 3.0625, |
| "learning_rate": 1.9524371069182386e-06, |
| "loss": 0.5451, |
| "mean_token_accuracy": 0.8465063005685807, |
| "num_tokens": 4382571.0, |
| "step": 2060 |
| }, |
| { |
| "epoch": 2.4412979351032447, |
| "grad_norm": 2.8125, |
| "learning_rate": 1.951126834381551e-06, |
| "loss": 0.4942, |
| "mean_token_accuracy": 0.8606890484690666, |
| "num_tokens": 4402569.0, |
| "step": 2070 |
| }, |
| { |
| "epoch": 2.4530973451327434, |
| "grad_norm": 2.546875, |
| "learning_rate": 1.9498165618448635e-06, |
| "loss": 0.5288, |
| "mean_token_accuracy": 0.8457216203212738, |
| "num_tokens": 4422955.0, |
| "step": 2080 |
| }, |
| { |
| "epoch": 2.464896755162242, |
| "grad_norm": 2.6875, |
| "learning_rate": 1.948506289308176e-06, |
| "loss": 0.5273, |
| "mean_token_accuracy": 0.8557865083217621, |
| "num_tokens": 4444361.0, |
| "step": 2090 |
| }, |
| { |
| "epoch": 2.4766961651917403, |
| "grad_norm": 3.265625, |
| "learning_rate": 1.9471960167714883e-06, |
| "loss": 0.5321, |
| "mean_token_accuracy": 0.8513106137514115, |
| "num_tokens": 4466742.0, |
| "step": 2100 |
| }, |
| { |
| "epoch": 2.488495575221239, |
| "grad_norm": 3.171875, |
| "learning_rate": 1.9458857442348007e-06, |
| "loss": 0.5181, |
| "mean_token_accuracy": 0.8511507600545883, |
| "num_tokens": 4487045.0, |
| "step": 2110 |
| }, |
| { |
| "epoch": 2.5002949852507372, |
| "grad_norm": 3.59375, |
| "learning_rate": 1.944575471698113e-06, |
| "loss": 0.552, |
| "mean_token_accuracy": 0.8471585810184479, |
| "num_tokens": 4508358.0, |
| "step": 2120 |
| }, |
| { |
| "epoch": 2.512094395280236, |
| "grad_norm": 3.484375, |
| "learning_rate": 1.9432651991614256e-06, |
| "loss": 0.5553, |
| "mean_token_accuracy": 0.8413275972008705, |
| "num_tokens": 4528423.0, |
| "step": 2130 |
| }, |
| { |
| "epoch": 2.5238938053097346, |
| "grad_norm": 2.984375, |
| "learning_rate": 1.941954926624738e-06, |
| "loss": 0.5033, |
| "mean_token_accuracy": 0.8596482038497925, |
| "num_tokens": 4548803.0, |
| "step": 2140 |
| }, |
| { |
| "epoch": 2.535693215339233, |
| "grad_norm": 4.25, |
| "learning_rate": 1.9406446540880504e-06, |
| "loss": 0.4987, |
| "mean_token_accuracy": 0.8573851436376572, |
| "num_tokens": 4569199.0, |
| "step": 2150 |
| }, |
| { |
| "epoch": 2.5474926253687316, |
| "grad_norm": 3.078125, |
| "learning_rate": 1.9393343815513624e-06, |
| "loss": 0.5089, |
| "mean_token_accuracy": 0.8548808857798577, |
| "num_tokens": 4590150.0, |
| "step": 2160 |
| }, |
| { |
| "epoch": 2.5592920353982302, |
| "grad_norm": 3.609375, |
| "learning_rate": 1.938024109014675e-06, |
| "loss": 0.5323, |
| "mean_token_accuracy": 0.8530441164970398, |
| "num_tokens": 4611106.0, |
| "step": 2170 |
| }, |
| { |
| "epoch": 2.5710914454277285, |
| "grad_norm": 2.84375, |
| "learning_rate": 1.9367138364779873e-06, |
| "loss": 0.5124, |
| "mean_token_accuracy": 0.8499145448207855, |
| "num_tokens": 4632367.0, |
| "step": 2180 |
| }, |
| { |
| "epoch": 2.582890855457227, |
| "grad_norm": 2.671875, |
| "learning_rate": 1.9354035639412997e-06, |
| "loss": 0.5598, |
| "mean_token_accuracy": 0.8409982249140739, |
| "num_tokens": 4654737.0, |
| "step": 2190 |
| }, |
| { |
| "epoch": 2.594690265486726, |
| "grad_norm": 3.84375, |
| "learning_rate": 1.934093291404612e-06, |
| "loss": 0.5611, |
| "mean_token_accuracy": 0.8476599439978599, |
| "num_tokens": 4674920.0, |
| "step": 2200 |
| }, |
| { |
| "epoch": 2.606489675516224, |
| "grad_norm": 3.21875, |
| "learning_rate": 1.9327830188679246e-06, |
| "loss": 0.5161, |
| "mean_token_accuracy": 0.8580936983227729, |
| "num_tokens": 4695044.0, |
| "step": 2210 |
| }, |
| { |
| "epoch": 2.618289085545723, |
| "grad_norm": 2.890625, |
| "learning_rate": 1.931472746331237e-06, |
| "loss": 0.5184, |
| "mean_token_accuracy": 0.8504636406898498, |
| "num_tokens": 4715862.0, |
| "step": 2220 |
| }, |
| { |
| "epoch": 2.6300884955752215, |
| "grad_norm": 3.46875, |
| "learning_rate": 1.9301624737945494e-06, |
| "loss": 0.5447, |
| "mean_token_accuracy": 0.8542811453342438, |
| "num_tokens": 4737389.0, |
| "step": 2230 |
| }, |
| { |
| "epoch": 2.6418879056047198, |
| "grad_norm": 2.75, |
| "learning_rate": 1.928852201257862e-06, |
| "loss": 0.5171, |
| "mean_token_accuracy": 0.8554289311170578, |
| "num_tokens": 4758702.0, |
| "step": 2240 |
| }, |
| { |
| "epoch": 2.6536873156342184, |
| "grad_norm": 3.265625, |
| "learning_rate": 1.927541928721174e-06, |
| "loss": 0.4929, |
| "mean_token_accuracy": 0.8608638659119606, |
| "num_tokens": 4780941.0, |
| "step": 2250 |
| }, |
| { |
| "epoch": 2.6654867256637167, |
| "grad_norm": 2.921875, |
| "learning_rate": 1.9262316561844862e-06, |
| "loss": 0.5131, |
| "mean_token_accuracy": 0.8563064381480217, |
| "num_tokens": 4803034.0, |
| "step": 2260 |
| }, |
| { |
| "epoch": 2.6772861356932154, |
| "grad_norm": 2.625, |
| "learning_rate": 1.9249213836477987e-06, |
| "loss": 0.5193, |
| "mean_token_accuracy": 0.853195971250534, |
| "num_tokens": 4824370.0, |
| "step": 2270 |
| }, |
| { |
| "epoch": 2.6890855457227136, |
| "grad_norm": 3.234375, |
| "learning_rate": 1.923611111111111e-06, |
| "loss": 0.5097, |
| "mean_token_accuracy": 0.8548060864210129, |
| "num_tokens": 4845290.0, |
| "step": 2280 |
| }, |
| { |
| "epoch": 2.7008849557522123, |
| "grad_norm": 3.078125, |
| "learning_rate": 1.9223008385744235e-06, |
| "loss": 0.4923, |
| "mean_token_accuracy": 0.8582960113883018, |
| "num_tokens": 4867013.0, |
| "step": 2290 |
| }, |
| { |
| "epoch": 2.712684365781711, |
| "grad_norm": 3.125, |
| "learning_rate": 1.9209905660377355e-06, |
| "loss": 0.5169, |
| "mean_token_accuracy": 0.8528640598058701, |
| "num_tokens": 4888572.0, |
| "step": 2300 |
| }, |
| { |
| "epoch": 2.7244837758112093, |
| "grad_norm": 2.625, |
| "learning_rate": 1.919680293501048e-06, |
| "loss": 0.5259, |
| "mean_token_accuracy": 0.8525827392935753, |
| "num_tokens": 4910221.0, |
| "step": 2310 |
| }, |
| { |
| "epoch": 2.736283185840708, |
| "grad_norm": 3.046875, |
| "learning_rate": 1.9183700209643604e-06, |
| "loss": 0.5169, |
| "mean_token_accuracy": 0.8574760437011719, |
| "num_tokens": 4931314.0, |
| "step": 2320 |
| }, |
| { |
| "epoch": 2.7480825958702066, |
| "grad_norm": 3.328125, |
| "learning_rate": 1.9170597484276728e-06, |
| "loss": 0.4867, |
| "mean_token_accuracy": 0.8613032251596451, |
| "num_tokens": 4952479.0, |
| "step": 2330 |
| }, |
| { |
| "epoch": 2.759882005899705, |
| "grad_norm": 2.25, |
| "learning_rate": 1.915749475890985e-06, |
| "loss": 0.4662, |
| "mean_token_accuracy": 0.8695660203695297, |
| "num_tokens": 4972794.0, |
| "step": 2340 |
| }, |
| { |
| "epoch": 2.7716814159292036, |
| "grad_norm": 2.875, |
| "learning_rate": 1.9144392033542976e-06, |
| "loss": 0.4621, |
| "mean_token_accuracy": 0.8688267141580581, |
| "num_tokens": 4994091.0, |
| "step": 2350 |
| }, |
| { |
| "epoch": 2.7834808259587023, |
| "grad_norm": 2.75, |
| "learning_rate": 1.91312893081761e-06, |
| "loss": 0.4384, |
| "mean_token_accuracy": 0.8672922357916832, |
| "num_tokens": 5015841.0, |
| "step": 2360 |
| }, |
| { |
| "epoch": 2.7952802359882005, |
| "grad_norm": 2.859375, |
| "learning_rate": 1.9118186582809225e-06, |
| "loss": 0.5288, |
| "mean_token_accuracy": 0.8498433127999305, |
| "num_tokens": 5036665.0, |
| "step": 2370 |
| }, |
| { |
| "epoch": 2.807079646017699, |
| "grad_norm": 2.4375, |
| "learning_rate": 1.910508385744235e-06, |
| "loss": 0.4991, |
| "mean_token_accuracy": 0.858961108326912, |
| "num_tokens": 5058598.0, |
| "step": 2380 |
| }, |
| { |
| "epoch": 2.818879056047198, |
| "grad_norm": 3.203125, |
| "learning_rate": 1.909198113207547e-06, |
| "loss": 0.5173, |
| "mean_token_accuracy": 0.8499269977211952, |
| "num_tokens": 5079618.0, |
| "step": 2390 |
| }, |
| { |
| "epoch": 2.830678466076696, |
| "grad_norm": 2.34375, |
| "learning_rate": 1.9078878406708593e-06, |
| "loss": 0.4847, |
| "mean_token_accuracy": 0.8587676703929901, |
| "num_tokens": 5100756.0, |
| "step": 2400 |
| }, |
| { |
| "epoch": 2.842477876106195, |
| "grad_norm": 3.34375, |
| "learning_rate": 1.9065775681341717e-06, |
| "loss": 0.4972, |
| "mean_token_accuracy": 0.8580302372574806, |
| "num_tokens": 5121687.0, |
| "step": 2410 |
| }, |
| { |
| "epoch": 2.854277286135693, |
| "grad_norm": 3.265625, |
| "learning_rate": 1.9052672955974842e-06, |
| "loss": 0.4991, |
| "mean_token_accuracy": 0.8608438298106194, |
| "num_tokens": 5142552.0, |
| "step": 2420 |
| }, |
| { |
| "epoch": 2.866076696165192, |
| "grad_norm": 3.34375, |
| "learning_rate": 1.9039570230607966e-06, |
| "loss": 0.4645, |
| "mean_token_accuracy": 0.8678088426589966, |
| "num_tokens": 5163635.0, |
| "step": 2430 |
| }, |
| { |
| "epoch": 2.87787610619469, |
| "grad_norm": 2.890625, |
| "learning_rate": 1.9026467505241088e-06, |
| "loss": 0.5442, |
| "mean_token_accuracy": 0.8509184956550598, |
| "num_tokens": 5185095.0, |
| "step": 2440 |
| }, |
| { |
| "epoch": 2.8896755162241887, |
| "grad_norm": 3.453125, |
| "learning_rate": 1.9013364779874212e-06, |
| "loss": 0.6003, |
| "mean_token_accuracy": 0.8469450145959854, |
| "num_tokens": 5206969.0, |
| "step": 2450 |
| }, |
| { |
| "epoch": 2.9014749262536874, |
| "grad_norm": 2.734375, |
| "learning_rate": 1.9000262054507337e-06, |
| "loss": 0.5016, |
| "mean_token_accuracy": 0.8594883874058723, |
| "num_tokens": 5229152.0, |
| "step": 2460 |
| }, |
| { |
| "epoch": 2.9132743362831857, |
| "grad_norm": 3.015625, |
| "learning_rate": 1.898715932914046e-06, |
| "loss": 0.5591, |
| "mean_token_accuracy": 0.8536392852663994, |
| "num_tokens": 5250489.0, |
| "step": 2470 |
| }, |
| { |
| "epoch": 2.9250737463126844, |
| "grad_norm": 2.609375, |
| "learning_rate": 1.8974056603773585e-06, |
| "loss": 0.4848, |
| "mean_token_accuracy": 0.8597168505191803, |
| "num_tokens": 5272849.0, |
| "step": 2480 |
| }, |
| { |
| "epoch": 2.936873156342183, |
| "grad_norm": 2.984375, |
| "learning_rate": 1.8960953878406707e-06, |
| "loss": 0.4767, |
| "mean_token_accuracy": 0.8632719159126282, |
| "num_tokens": 5295205.0, |
| "step": 2490 |
| }, |
| { |
| "epoch": 2.9486725663716813, |
| "grad_norm": 3.203125, |
| "learning_rate": 1.8947851153039831e-06, |
| "loss": 0.5417, |
| "mean_token_accuracy": 0.8518997684121132, |
| "num_tokens": 5317453.0, |
| "step": 2500 |
| }, |
| { |
| "epoch": 2.96047197640118, |
| "grad_norm": 3.375, |
| "learning_rate": 1.8934748427672956e-06, |
| "loss": 0.4579, |
| "mean_token_accuracy": 0.8677797138690948, |
| "num_tokens": 5337784.0, |
| "step": 2510 |
| }, |
| { |
| "epoch": 2.9722713864306787, |
| "grad_norm": 4.1875, |
| "learning_rate": 1.892164570230608e-06, |
| "loss": 0.482, |
| "mean_token_accuracy": 0.865347146987915, |
| "num_tokens": 5358018.0, |
| "step": 2520 |
| }, |
| { |
| "epoch": 2.984070796460177, |
| "grad_norm": 3.578125, |
| "learning_rate": 1.8908542976939204e-06, |
| "loss": 0.5301, |
| "mean_token_accuracy": 0.8472265586256981, |
| "num_tokens": 5377989.0, |
| "step": 2530 |
| }, |
| { |
| "epoch": 2.9958702064896756, |
| "grad_norm": 2.953125, |
| "learning_rate": 1.8895440251572326e-06, |
| "loss": 0.5373, |
| "mean_token_accuracy": 0.8547817453742027, |
| "num_tokens": 5398462.0, |
| "step": 2540 |
| }, |
| { |
| "epoch": 3.007079646017699, |
| "grad_norm": 2.46875, |
| "learning_rate": 1.888233752620545e-06, |
| "loss": 0.5372, |
| "mean_token_accuracy": 0.8555425123164528, |
| "num_tokens": 5418196.0, |
| "step": 2550 |
| }, |
| { |
| "epoch": 3.0188790560471976, |
| "grad_norm": 2.9375, |
| "learning_rate": 1.8869234800838575e-06, |
| "loss": 0.5334, |
| "mean_token_accuracy": 0.8561224088072776, |
| "num_tokens": 5439978.0, |
| "step": 2560 |
| }, |
| { |
| "epoch": 3.0306784660766963, |
| "grad_norm": 3.0, |
| "learning_rate": 1.8856132075471699e-06, |
| "loss": 0.515, |
| "mean_token_accuracy": 0.8609302997589111, |
| "num_tokens": 5461507.0, |
| "step": 2570 |
| }, |
| { |
| "epoch": 3.0424778761061946, |
| "grad_norm": 2.96875, |
| "learning_rate": 1.8843029350104819e-06, |
| "loss": 0.4935, |
| "mean_token_accuracy": 0.8608765229582787, |
| "num_tokens": 5481889.0, |
| "step": 2580 |
| }, |
| { |
| "epoch": 3.0542772861356933, |
| "grad_norm": 3.171875, |
| "learning_rate": 1.8829926624737943e-06, |
| "loss": 0.4953, |
| "mean_token_accuracy": 0.8559648513793945, |
| "num_tokens": 5502205.0, |
| "step": 2590 |
| }, |
| { |
| "epoch": 3.066076696165192, |
| "grad_norm": 3.171875, |
| "learning_rate": 1.8816823899371067e-06, |
| "loss": 0.5494, |
| "mean_token_accuracy": 0.8441193103790283, |
| "num_tokens": 5524246.0, |
| "step": 2600 |
| }, |
| { |
| "epoch": 3.07787610619469, |
| "grad_norm": 3.296875, |
| "learning_rate": 1.8803721174004192e-06, |
| "loss": 0.4923, |
| "mean_token_accuracy": 0.8591754183173179, |
| "num_tokens": 5545301.0, |
| "step": 2610 |
| }, |
| { |
| "epoch": 3.089675516224189, |
| "grad_norm": 2.59375, |
| "learning_rate": 1.8790618448637316e-06, |
| "loss": 0.4928, |
| "mean_token_accuracy": 0.8619665756821633, |
| "num_tokens": 5568267.0, |
| "step": 2620 |
| }, |
| { |
| "epoch": 3.101474926253687, |
| "grad_norm": 3.1875, |
| "learning_rate": 1.8777515723270438e-06, |
| "loss": 0.501, |
| "mean_token_accuracy": 0.8605804219841957, |
| "num_tokens": 5588928.0, |
| "step": 2630 |
| }, |
| { |
| "epoch": 3.113274336283186, |
| "grad_norm": 3.734375, |
| "learning_rate": 1.8764412997903562e-06, |
| "loss": 0.4818, |
| "mean_token_accuracy": 0.8548641964793205, |
| "num_tokens": 5609590.0, |
| "step": 2640 |
| }, |
| { |
| "epoch": 3.1250737463126845, |
| "grad_norm": 2.953125, |
| "learning_rate": 1.8751310272536686e-06, |
| "loss": 0.4937, |
| "mean_token_accuracy": 0.8586649984121323, |
| "num_tokens": 5631029.0, |
| "step": 2650 |
| }, |
| { |
| "epoch": 3.1368731563421828, |
| "grad_norm": 2.890625, |
| "learning_rate": 1.873820754716981e-06, |
| "loss": 0.5593, |
| "mean_token_accuracy": 0.8506124198436738, |
| "num_tokens": 5652708.0, |
| "step": 2660 |
| }, |
| { |
| "epoch": 3.1486725663716815, |
| "grad_norm": 3.0, |
| "learning_rate": 1.8725104821802935e-06, |
| "loss": 0.481, |
| "mean_token_accuracy": 0.8614728271961212, |
| "num_tokens": 5672587.0, |
| "step": 2670 |
| }, |
| { |
| "epoch": 3.1604719764011797, |
| "grad_norm": 3.984375, |
| "learning_rate": 1.8712002096436057e-06, |
| "loss": 0.5488, |
| "mean_token_accuracy": 0.8480077460408211, |
| "num_tokens": 5694550.0, |
| "step": 2680 |
| }, |
| { |
| "epoch": 3.1722713864306784, |
| "grad_norm": 2.90625, |
| "learning_rate": 1.8698899371069181e-06, |
| "loss": 0.536, |
| "mean_token_accuracy": 0.8506260320544243, |
| "num_tokens": 5716770.0, |
| "step": 2690 |
| }, |
| { |
| "epoch": 3.184070796460177, |
| "grad_norm": 2.578125, |
| "learning_rate": 1.8685796645702305e-06, |
| "loss": 0.5297, |
| "mean_token_accuracy": 0.8521213725209236, |
| "num_tokens": 5739574.0, |
| "step": 2700 |
| }, |
| { |
| "epoch": 3.1958702064896753, |
| "grad_norm": 2.546875, |
| "learning_rate": 1.867269392033543e-06, |
| "loss": 0.4858, |
| "mean_token_accuracy": 0.8626686468720436, |
| "num_tokens": 5760890.0, |
| "step": 2710 |
| }, |
| { |
| "epoch": 3.207669616519174, |
| "grad_norm": 3.046875, |
| "learning_rate": 1.8659591194968554e-06, |
| "loss": 0.516, |
| "mean_token_accuracy": 0.8553755402565002, |
| "num_tokens": 5782939.0, |
| "step": 2720 |
| }, |
| { |
| "epoch": 3.2194690265486727, |
| "grad_norm": 3.046875, |
| "learning_rate": 1.8646488469601676e-06, |
| "loss": 0.5144, |
| "mean_token_accuracy": 0.8556733667850495, |
| "num_tokens": 5804411.0, |
| "step": 2730 |
| }, |
| { |
| "epoch": 3.231268436578171, |
| "grad_norm": 3.609375, |
| "learning_rate": 1.86333857442348e-06, |
| "loss": 0.4871, |
| "mean_token_accuracy": 0.8582729786634445, |
| "num_tokens": 5825304.0, |
| "step": 2740 |
| }, |
| { |
| "epoch": 3.2430678466076697, |
| "grad_norm": 2.859375, |
| "learning_rate": 1.8620283018867924e-06, |
| "loss": 0.5685, |
| "mean_token_accuracy": 0.8471158519387245, |
| "num_tokens": 5847062.0, |
| "step": 2750 |
| }, |
| { |
| "epoch": 3.2548672566371684, |
| "grad_norm": 2.4375, |
| "learning_rate": 1.8607180293501049e-06, |
| "loss": 0.4777, |
| "mean_token_accuracy": 0.8659245818853378, |
| "num_tokens": 5867918.0, |
| "step": 2760 |
| }, |
| { |
| "epoch": 3.2666666666666666, |
| "grad_norm": 2.265625, |
| "learning_rate": 1.8594077568134173e-06, |
| "loss": 0.4594, |
| "mean_token_accuracy": 0.8666694790124894, |
| "num_tokens": 5890721.0, |
| "step": 2770 |
| }, |
| { |
| "epoch": 3.2784660766961653, |
| "grad_norm": 2.984375, |
| "learning_rate": 1.8580974842767295e-06, |
| "loss": 0.4515, |
| "mean_token_accuracy": 0.8679191544651985, |
| "num_tokens": 5912707.0, |
| "step": 2780 |
| }, |
| { |
| "epoch": 3.2902654867256635, |
| "grad_norm": 3.0625, |
| "learning_rate": 1.856787211740042e-06, |
| "loss": 0.5313, |
| "mean_token_accuracy": 0.8570784822106361, |
| "num_tokens": 5935619.0, |
| "step": 2790 |
| }, |
| { |
| "epoch": 3.3020648967551622, |
| "grad_norm": 3.109375, |
| "learning_rate": 1.8554769392033541e-06, |
| "loss": 0.5186, |
| "mean_token_accuracy": 0.85502108335495, |
| "num_tokens": 5957631.0, |
| "step": 2800 |
| }, |
| { |
| "epoch": 3.313864306784661, |
| "grad_norm": 3.125, |
| "learning_rate": 1.8541666666666666e-06, |
| "loss": 0.5247, |
| "mean_token_accuracy": 0.8490757644176483, |
| "num_tokens": 5978543.0, |
| "step": 2810 |
| }, |
| { |
| "epoch": 3.325663716814159, |
| "grad_norm": 3.25, |
| "learning_rate": 1.8528563941299788e-06, |
| "loss": 0.5311, |
| "mean_token_accuracy": 0.8528130516409874, |
| "num_tokens": 5999394.0, |
| "step": 2820 |
| }, |
| { |
| "epoch": 3.337463126843658, |
| "grad_norm": 3.4375, |
| "learning_rate": 1.8515461215932912e-06, |
| "loss": 0.4583, |
| "mean_token_accuracy": 0.8701539307832717, |
| "num_tokens": 6021294.0, |
| "step": 2830 |
| }, |
| { |
| "epoch": 3.349262536873156, |
| "grad_norm": 2.6875, |
| "learning_rate": 1.8502358490566036e-06, |
| "loss": 0.4641, |
| "mean_token_accuracy": 0.8679096519947052, |
| "num_tokens": 6043359.0, |
| "step": 2840 |
| }, |
| { |
| "epoch": 3.361061946902655, |
| "grad_norm": 2.78125, |
| "learning_rate": 1.848925576519916e-06, |
| "loss": 0.4973, |
| "mean_token_accuracy": 0.8571968704462052, |
| "num_tokens": 6064200.0, |
| "step": 2850 |
| }, |
| { |
| "epoch": 3.3728613569321535, |
| "grad_norm": 3.15625, |
| "learning_rate": 1.8476153039832285e-06, |
| "loss": 0.5243, |
| "mean_token_accuracy": 0.8558759972453117, |
| "num_tokens": 6084471.0, |
| "step": 2860 |
| }, |
| { |
| "epoch": 3.3846607669616517, |
| "grad_norm": 2.765625, |
| "learning_rate": 1.8463050314465407e-06, |
| "loss": 0.4703, |
| "mean_token_accuracy": 0.8605129286646843, |
| "num_tokens": 6105607.0, |
| "step": 2870 |
| }, |
| { |
| "epoch": 3.3964601769911504, |
| "grad_norm": 2.546875, |
| "learning_rate": 1.844994758909853e-06, |
| "loss": 0.5099, |
| "mean_token_accuracy": 0.8579583093523979, |
| "num_tokens": 6126819.0, |
| "step": 2880 |
| }, |
| { |
| "epoch": 3.408259587020649, |
| "grad_norm": 2.140625, |
| "learning_rate": 1.8436844863731655e-06, |
| "loss": 0.4811, |
| "mean_token_accuracy": 0.8615627244114876, |
| "num_tokens": 6147734.0, |
| "step": 2890 |
| }, |
| { |
| "epoch": 3.4200589970501474, |
| "grad_norm": 3.015625, |
| "learning_rate": 1.842374213836478e-06, |
| "loss": 0.4815, |
| "mean_token_accuracy": 0.8623376488685608, |
| "num_tokens": 6169987.0, |
| "step": 2900 |
| }, |
| { |
| "epoch": 3.431858407079646, |
| "grad_norm": 2.421875, |
| "learning_rate": 1.8410639412997904e-06, |
| "loss": 0.4735, |
| "mean_token_accuracy": 0.8635832741856575, |
| "num_tokens": 6190461.0, |
| "step": 2910 |
| }, |
| { |
| "epoch": 3.4436578171091448, |
| "grad_norm": 2.828125, |
| "learning_rate": 1.8397536687631026e-06, |
| "loss": 0.4795, |
| "mean_token_accuracy": 0.8627579301595688, |
| "num_tokens": 6210728.0, |
| "step": 2920 |
| }, |
| { |
| "epoch": 3.455457227138643, |
| "grad_norm": 2.5625, |
| "learning_rate": 1.838443396226415e-06, |
| "loss": 0.4704, |
| "mean_token_accuracy": 0.8640405595302582, |
| "num_tokens": 6230558.0, |
| "step": 2930 |
| }, |
| { |
| "epoch": 3.4672566371681417, |
| "grad_norm": 2.484375, |
| "learning_rate": 1.8371331236897274e-06, |
| "loss": 0.5039, |
| "mean_token_accuracy": 0.8607368037104607, |
| "num_tokens": 6252126.0, |
| "step": 2940 |
| }, |
| { |
| "epoch": 3.47905604719764, |
| "grad_norm": 3.1875, |
| "learning_rate": 1.8358228511530399e-06, |
| "loss": 0.554, |
| "mean_token_accuracy": 0.8453941598534584, |
| "num_tokens": 6273355.0, |
| "step": 2950 |
| }, |
| { |
| "epoch": 3.4908554572271386, |
| "grad_norm": 3.0625, |
| "learning_rate": 1.8345125786163523e-06, |
| "loss": 0.5255, |
| "mean_token_accuracy": 0.8594121977686882, |
| "num_tokens": 6295512.0, |
| "step": 2960 |
| }, |
| { |
| "epoch": 3.5026548672566373, |
| "grad_norm": 2.71875, |
| "learning_rate": 1.8332023060796645e-06, |
| "loss": 0.5135, |
| "mean_token_accuracy": 0.8592988654971123, |
| "num_tokens": 6317517.0, |
| "step": 2970 |
| }, |
| { |
| "epoch": 3.5144542772861356, |
| "grad_norm": 2.625, |
| "learning_rate": 1.831892033542977e-06, |
| "loss": 0.4596, |
| "mean_token_accuracy": 0.8651928603649139, |
| "num_tokens": 6338348.0, |
| "step": 2980 |
| }, |
| { |
| "epoch": 3.5262536873156343, |
| "grad_norm": 2.671875, |
| "learning_rate": 1.8305817610062893e-06, |
| "loss": 0.4974, |
| "mean_token_accuracy": 0.860434214770794, |
| "num_tokens": 6358776.0, |
| "step": 2990 |
| }, |
| { |
| "epoch": 3.5380530973451325, |
| "grad_norm": 2.859375, |
| "learning_rate": 1.8292714884696018e-06, |
| "loss": 0.4592, |
| "mean_token_accuracy": 0.8664771750569343, |
| "num_tokens": 6380603.0, |
| "step": 3000 |
| }, |
| { |
| "epoch": 3.549852507374631, |
| "grad_norm": 3.296875, |
| "learning_rate": 1.8279612159329138e-06, |
| "loss": 0.5006, |
| "mean_token_accuracy": 0.8568850561976433, |
| "num_tokens": 6401706.0, |
| "step": 3010 |
| }, |
| { |
| "epoch": 3.56165191740413, |
| "grad_norm": 3.171875, |
| "learning_rate": 1.8266509433962262e-06, |
| "loss": 0.5257, |
| "mean_token_accuracy": 0.8553405821323394, |
| "num_tokens": 6422804.0, |
| "step": 3020 |
| }, |
| { |
| "epoch": 3.573451327433628, |
| "grad_norm": 2.71875, |
| "learning_rate": 1.8253406708595386e-06, |
| "loss": 0.5256, |
| "mean_token_accuracy": 0.8555272459983826, |
| "num_tokens": 6443855.0, |
| "step": 3030 |
| }, |
| { |
| "epoch": 3.585250737463127, |
| "grad_norm": 3.84375, |
| "learning_rate": 1.824030398322851e-06, |
| "loss": 0.5382, |
| "mean_token_accuracy": 0.8499476745724678, |
| "num_tokens": 6464819.0, |
| "step": 3040 |
| }, |
| { |
| "epoch": 3.5970501474926255, |
| "grad_norm": 3.203125, |
| "learning_rate": 1.8227201257861634e-06, |
| "loss": 0.5243, |
| "mean_token_accuracy": 0.8548250257968902, |
| "num_tokens": 6486536.0, |
| "step": 3050 |
| }, |
| { |
| "epoch": 3.6088495575221238, |
| "grad_norm": 2.671875, |
| "learning_rate": 1.8214098532494757e-06, |
| "loss": 0.4919, |
| "mean_token_accuracy": 0.8628597438335419, |
| "num_tokens": 6508047.0, |
| "step": 3060 |
| }, |
| { |
| "epoch": 3.6206489675516225, |
| "grad_norm": 3.5, |
| "learning_rate": 1.820099580712788e-06, |
| "loss": 0.4725, |
| "mean_token_accuracy": 0.8650040581822396, |
| "num_tokens": 6528733.0, |
| "step": 3070 |
| }, |
| { |
| "epoch": 3.632448377581121, |
| "grad_norm": 2.765625, |
| "learning_rate": 1.8187893081761005e-06, |
| "loss": 0.5071, |
| "mean_token_accuracy": 0.8603786826133728, |
| "num_tokens": 6549560.0, |
| "step": 3080 |
| }, |
| { |
| "epoch": 3.6442477876106194, |
| "grad_norm": 3.234375, |
| "learning_rate": 1.817479035639413e-06, |
| "loss": 0.5054, |
| "mean_token_accuracy": 0.8500581681728363, |
| "num_tokens": 6570485.0, |
| "step": 3090 |
| }, |
| { |
| "epoch": 3.656047197640118, |
| "grad_norm": 2.90625, |
| "learning_rate": 1.8161687631027254e-06, |
| "loss": 0.5119, |
| "mean_token_accuracy": 0.8553215324878692, |
| "num_tokens": 6592321.0, |
| "step": 3100 |
| }, |
| { |
| "epoch": 3.667846607669617, |
| "grad_norm": 2.15625, |
| "learning_rate": 1.8148584905660376e-06, |
| "loss": 0.4523, |
| "mean_token_accuracy": 0.868057268857956, |
| "num_tokens": 6613787.0, |
| "step": 3110 |
| }, |
| { |
| "epoch": 3.679646017699115, |
| "grad_norm": 2.796875, |
| "learning_rate": 1.81354821802935e-06, |
| "loss": 0.5279, |
| "mean_token_accuracy": 0.8529254078865052, |
| "num_tokens": 6634300.0, |
| "step": 3120 |
| }, |
| { |
| "epoch": 3.6914454277286137, |
| "grad_norm": 2.953125, |
| "learning_rate": 1.8122379454926624e-06, |
| "loss": 0.4776, |
| "mean_token_accuracy": 0.8583889678120613, |
| "num_tokens": 6655606.0, |
| "step": 3130 |
| }, |
| { |
| "epoch": 3.703244837758112, |
| "grad_norm": 3.25, |
| "learning_rate": 1.8109276729559748e-06, |
| "loss": 0.5314, |
| "mean_token_accuracy": 0.8478195607662201, |
| "num_tokens": 6676662.0, |
| "step": 3140 |
| }, |
| { |
| "epoch": 3.7150442477876107, |
| "grad_norm": 3.640625, |
| "learning_rate": 1.8096174004192873e-06, |
| "loss": 0.5195, |
| "mean_token_accuracy": 0.8531151413917542, |
| "num_tokens": 6696714.0, |
| "step": 3150 |
| }, |
| { |
| "epoch": 3.726843657817109, |
| "grad_norm": 2.703125, |
| "learning_rate": 1.8083071278825995e-06, |
| "loss": 0.4897, |
| "mean_token_accuracy": 0.8544636845588685, |
| "num_tokens": 6716357.0, |
| "step": 3160 |
| }, |
| { |
| "epoch": 3.7386430678466076, |
| "grad_norm": 2.515625, |
| "learning_rate": 1.8069968553459119e-06, |
| "loss": 0.524, |
| "mean_token_accuracy": 0.8544845834374428, |
| "num_tokens": 6738177.0, |
| "step": 3170 |
| }, |
| { |
| "epoch": 3.7504424778761063, |
| "grad_norm": 3.234375, |
| "learning_rate": 1.8056865828092243e-06, |
| "loss": 0.5005, |
| "mean_token_accuracy": 0.8554182901978493, |
| "num_tokens": 6759235.0, |
| "step": 3180 |
| }, |
| { |
| "epoch": 3.7622418879056045, |
| "grad_norm": 2.9375, |
| "learning_rate": 1.8043763102725367e-06, |
| "loss": 0.5147, |
| "mean_token_accuracy": 0.8651719704270363, |
| "num_tokens": 6781008.0, |
| "step": 3190 |
| }, |
| { |
| "epoch": 3.7740412979351032, |
| "grad_norm": 2.828125, |
| "learning_rate": 1.803066037735849e-06, |
| "loss": 0.4655, |
| "mean_token_accuracy": 0.8662549510598183, |
| "num_tokens": 6801945.0, |
| "step": 3200 |
| }, |
| { |
| "epoch": 3.785840707964602, |
| "grad_norm": 2.859375, |
| "learning_rate": 1.8017557651991614e-06, |
| "loss": 0.5046, |
| "mean_token_accuracy": 0.8528249442577363, |
| "num_tokens": 6821713.0, |
| "step": 3210 |
| }, |
| { |
| "epoch": 3.7976401179941, |
| "grad_norm": 2.96875, |
| "learning_rate": 1.8004454926624738e-06, |
| "loss": 0.5062, |
| "mean_token_accuracy": 0.8547831267118454, |
| "num_tokens": 6841980.0, |
| "step": 3220 |
| }, |
| { |
| "epoch": 3.809439528023599, |
| "grad_norm": 3.046875, |
| "learning_rate": 1.799135220125786e-06, |
| "loss": 0.4729, |
| "mean_token_accuracy": 0.8685174465179444, |
| "num_tokens": 6863536.0, |
| "step": 3230 |
| }, |
| { |
| "epoch": 3.8212389380530976, |
| "grad_norm": 3.265625, |
| "learning_rate": 1.7978249475890984e-06, |
| "loss": 0.4654, |
| "mean_token_accuracy": 0.8676441088318825, |
| "num_tokens": 6884973.0, |
| "step": 3240 |
| }, |
| { |
| "epoch": 3.833038348082596, |
| "grad_norm": 2.984375, |
| "learning_rate": 1.7965146750524106e-06, |
| "loss": 0.4723, |
| "mean_token_accuracy": 0.8614483773708344, |
| "num_tokens": 6906563.0, |
| "step": 3250 |
| }, |
| { |
| "epoch": 3.8448377581120945, |
| "grad_norm": 3.71875, |
| "learning_rate": 1.795204402515723e-06, |
| "loss": 0.5365, |
| "mean_token_accuracy": 0.8562477350234985, |
| "num_tokens": 6928970.0, |
| "step": 3260 |
| }, |
| { |
| "epoch": 3.856637168141593, |
| "grad_norm": 3.359375, |
| "learning_rate": 1.7938941299790355e-06, |
| "loss": 0.4936, |
| "mean_token_accuracy": 0.8592018365859986, |
| "num_tokens": 6949293.0, |
| "step": 3270 |
| }, |
| { |
| "epoch": 3.8684365781710914, |
| "grad_norm": 2.703125, |
| "learning_rate": 1.792583857442348e-06, |
| "loss": 0.5129, |
| "mean_token_accuracy": 0.8567561507225037, |
| "num_tokens": 6970131.0, |
| "step": 3280 |
| }, |
| { |
| "epoch": 3.88023598820059, |
| "grad_norm": 2.765625, |
| "learning_rate": 1.7912735849056603e-06, |
| "loss": 0.5037, |
| "mean_token_accuracy": 0.857287335395813, |
| "num_tokens": 6991347.0, |
| "step": 3290 |
| }, |
| { |
| "epoch": 3.8920353982300884, |
| "grad_norm": 3.328125, |
| "learning_rate": 1.7899633123689725e-06, |
| "loss": 0.5377, |
| "mean_token_accuracy": 0.8577619284391403, |
| "num_tokens": 7014336.0, |
| "step": 3300 |
| }, |
| { |
| "epoch": 3.903834808259587, |
| "grad_norm": 3.59375, |
| "learning_rate": 1.788653039832285e-06, |
| "loss": 0.539, |
| "mean_token_accuracy": 0.8492809280753135, |
| "num_tokens": 7036198.0, |
| "step": 3310 |
| }, |
| { |
| "epoch": 3.9156342182890853, |
| "grad_norm": 3.078125, |
| "learning_rate": 1.7873427672955974e-06, |
| "loss": 0.5577, |
| "mean_token_accuracy": 0.8459936797618866, |
| "num_tokens": 7056745.0, |
| "step": 3320 |
| }, |
| { |
| "epoch": 3.927433628318584, |
| "grad_norm": 3.453125, |
| "learning_rate": 1.7860324947589098e-06, |
| "loss": 0.4863, |
| "mean_token_accuracy": 0.8669529646635056, |
| "num_tokens": 7079119.0, |
| "step": 3330 |
| }, |
| { |
| "epoch": 3.9392330383480827, |
| "grad_norm": 2.59375, |
| "learning_rate": 1.7847222222222222e-06, |
| "loss": 0.5165, |
| "mean_token_accuracy": 0.857589852809906, |
| "num_tokens": 7101240.0, |
| "step": 3340 |
| }, |
| { |
| "epoch": 3.951032448377581, |
| "grad_norm": 3.25, |
| "learning_rate": 1.7834119496855345e-06, |
| "loss": 0.4974, |
| "mean_token_accuracy": 0.8588278472423554, |
| "num_tokens": 7122968.0, |
| "step": 3350 |
| }, |
| { |
| "epoch": 3.9628318584070796, |
| "grad_norm": 3.21875, |
| "learning_rate": 1.7821016771488469e-06, |
| "loss": 0.4827, |
| "mean_token_accuracy": 0.8636017799377441, |
| "num_tokens": 7142665.0, |
| "step": 3360 |
| }, |
| { |
| "epoch": 3.9746312684365783, |
| "grad_norm": 2.96875, |
| "learning_rate": 1.7807914046121593e-06, |
| "loss": 0.476, |
| "mean_token_accuracy": 0.8629874095320702, |
| "num_tokens": 7163748.0, |
| "step": 3370 |
| }, |
| { |
| "epoch": 3.9864306784660766, |
| "grad_norm": 3.0625, |
| "learning_rate": 1.7794811320754717e-06, |
| "loss": 0.507, |
| "mean_token_accuracy": 0.8541356608271599, |
| "num_tokens": 7184897.0, |
| "step": 3380 |
| }, |
| { |
| "epoch": 3.9982300884955753, |
| "grad_norm": 2.859375, |
| "learning_rate": 1.7781708595387841e-06, |
| "loss": 0.4516, |
| "mean_token_accuracy": 0.8715907648205757, |
| "num_tokens": 7204758.0, |
| "step": 3390 |
| }, |
| { |
| "epoch": 4.009439528023599, |
| "grad_norm": 2.40625, |
| "learning_rate": 1.7768605870020964e-06, |
| "loss": 0.4856, |
| "mean_token_accuracy": 0.861862672002692, |
| "num_tokens": 7225362.0, |
| "step": 3400 |
| }, |
| { |
| "epoch": 4.021238938053098, |
| "grad_norm": 2.890625, |
| "learning_rate": 1.7755503144654088e-06, |
| "loss": 0.5504, |
| "mean_token_accuracy": 0.8519014567136765, |
| "num_tokens": 7246630.0, |
| "step": 3410 |
| }, |
| { |
| "epoch": 4.033038348082596, |
| "grad_norm": 3.671875, |
| "learning_rate": 1.7742400419287212e-06, |
| "loss": 0.5143, |
| "mean_token_accuracy": 0.8614159673452377, |
| "num_tokens": 7267968.0, |
| "step": 3420 |
| }, |
| { |
| "epoch": 4.044837758112094, |
| "grad_norm": 2.65625, |
| "learning_rate": 1.7729297693920336e-06, |
| "loss": 0.5069, |
| "mean_token_accuracy": 0.8614302575588226, |
| "num_tokens": 7292124.0, |
| "step": 3430 |
| }, |
| { |
| "epoch": 4.056637168141593, |
| "grad_norm": 3.796875, |
| "learning_rate": 1.7716194968553456e-06, |
| "loss": 0.523, |
| "mean_token_accuracy": 0.8529177412390709, |
| "num_tokens": 7314054.0, |
| "step": 3440 |
| }, |
| { |
| "epoch": 4.068436578171092, |
| "grad_norm": 2.234375, |
| "learning_rate": 1.770309224318658e-06, |
| "loss": 0.5389, |
| "mean_token_accuracy": 0.8498711720108986, |
| "num_tokens": 7335683.0, |
| "step": 3450 |
| }, |
| { |
| "epoch": 4.08023598820059, |
| "grad_norm": 2.796875, |
| "learning_rate": 1.7689989517819705e-06, |
| "loss": 0.5063, |
| "mean_token_accuracy": 0.8558592557907104, |
| "num_tokens": 7357337.0, |
| "step": 3460 |
| }, |
| { |
| "epoch": 4.092035398230088, |
| "grad_norm": 2.8125, |
| "learning_rate": 1.767688679245283e-06, |
| "loss": 0.5013, |
| "mean_token_accuracy": 0.8538054883480072, |
| "num_tokens": 7378189.0, |
| "step": 3470 |
| }, |
| { |
| "epoch": 4.103834808259587, |
| "grad_norm": 2.359375, |
| "learning_rate": 1.7663784067085953e-06, |
| "loss": 0.5154, |
| "mean_token_accuracy": 0.8593904256820679, |
| "num_tokens": 7399662.0, |
| "step": 3480 |
| }, |
| { |
| "epoch": 4.1156342182890855, |
| "grad_norm": 3.25, |
| "learning_rate": 1.7650681341719075e-06, |
| "loss": 0.5085, |
| "mean_token_accuracy": 0.8571495711803436, |
| "num_tokens": 7421650.0, |
| "step": 3490 |
| }, |
| { |
| "epoch": 4.127433628318584, |
| "grad_norm": 3.0625, |
| "learning_rate": 1.76375786163522e-06, |
| "loss": 0.4956, |
| "mean_token_accuracy": 0.8594403684139251, |
| "num_tokens": 7442992.0, |
| "step": 3500 |
| }, |
| { |
| "epoch": 4.139233038348083, |
| "grad_norm": 3.5, |
| "learning_rate": 1.7624475890985324e-06, |
| "loss": 0.4995, |
| "mean_token_accuracy": 0.8558610767126084, |
| "num_tokens": 7464176.0, |
| "step": 3510 |
| }, |
| { |
| "epoch": 4.151032448377581, |
| "grad_norm": 2.8125, |
| "learning_rate": 1.7611373165618448e-06, |
| "loss": 0.557, |
| "mean_token_accuracy": 0.8490182489156723, |
| "num_tokens": 7484602.0, |
| "step": 3520 |
| }, |
| { |
| "epoch": 4.162831858407079, |
| "grad_norm": 3.28125, |
| "learning_rate": 1.7598270440251572e-06, |
| "loss": 0.5158, |
| "mean_token_accuracy": 0.8550442889332771, |
| "num_tokens": 7505296.0, |
| "step": 3530 |
| }, |
| { |
| "epoch": 4.1746312684365785, |
| "grad_norm": 3.375, |
| "learning_rate": 1.7585167714884694e-06, |
| "loss": 0.4753, |
| "mean_token_accuracy": 0.8652427449822426, |
| "num_tokens": 7526305.0, |
| "step": 3540 |
| }, |
| { |
| "epoch": 4.186430678466077, |
| "grad_norm": 3.125, |
| "learning_rate": 1.7572064989517819e-06, |
| "loss": 0.5086, |
| "mean_token_accuracy": 0.8554024577140809, |
| "num_tokens": 7547126.0, |
| "step": 3550 |
| }, |
| { |
| "epoch": 4.198230088495575, |
| "grad_norm": 3.90625, |
| "learning_rate": 1.7558962264150943e-06, |
| "loss": 0.5482, |
| "mean_token_accuracy": 0.8472519546747208, |
| "num_tokens": 7569491.0, |
| "step": 3560 |
| }, |
| { |
| "epoch": 4.210029498525074, |
| "grad_norm": 3.453125, |
| "learning_rate": 1.7545859538784067e-06, |
| "loss": 0.5071, |
| "mean_token_accuracy": 0.8492194801568985, |
| "num_tokens": 7591071.0, |
| "step": 3570 |
| }, |
| { |
| "epoch": 4.221828908554572, |
| "grad_norm": 3.515625, |
| "learning_rate": 1.7532756813417191e-06, |
| "loss": 0.5457, |
| "mean_token_accuracy": 0.8525054410099984, |
| "num_tokens": 7612524.0, |
| "step": 3580 |
| }, |
| { |
| "epoch": 4.233628318584071, |
| "grad_norm": 3.28125, |
| "learning_rate": 1.7519654088050313e-06, |
| "loss": 0.5263, |
| "mean_token_accuracy": 0.8528582498431205, |
| "num_tokens": 7633351.0, |
| "step": 3590 |
| }, |
| { |
| "epoch": 4.245427728613569, |
| "grad_norm": 2.296875, |
| "learning_rate": 1.7506551362683438e-06, |
| "loss": 0.4592, |
| "mean_token_accuracy": 0.8690110236406327, |
| "num_tokens": 7654676.0, |
| "step": 3600 |
| }, |
| { |
| "epoch": 4.257227138643068, |
| "grad_norm": 3.375, |
| "learning_rate": 1.7493448637316562e-06, |
| "loss": 0.515, |
| "mean_token_accuracy": 0.8553324103355407, |
| "num_tokens": 7675721.0, |
| "step": 3610 |
| }, |
| { |
| "epoch": 4.269026548672566, |
| "grad_norm": 2.484375, |
| "learning_rate": 1.7480345911949686e-06, |
| "loss": 0.4982, |
| "mean_token_accuracy": 0.8597420081496239, |
| "num_tokens": 7698310.0, |
| "step": 3620 |
| }, |
| { |
| "epoch": 4.2808259587020645, |
| "grad_norm": 3.25, |
| "learning_rate": 1.7467243186582808e-06, |
| "loss": 0.478, |
| "mean_token_accuracy": 0.8636646822094918, |
| "num_tokens": 7719625.0, |
| "step": 3630 |
| }, |
| { |
| "epoch": 4.292625368731564, |
| "grad_norm": 3.03125, |
| "learning_rate": 1.7454140461215932e-06, |
| "loss": 0.4948, |
| "mean_token_accuracy": 0.8621580928564072, |
| "num_tokens": 7740185.0, |
| "step": 3640 |
| }, |
| { |
| "epoch": 4.304424778761062, |
| "grad_norm": 2.9375, |
| "learning_rate": 1.7441037735849057e-06, |
| "loss": 0.4525, |
| "mean_token_accuracy": 0.8703215345740318, |
| "num_tokens": 7761009.0, |
| "step": 3650 |
| }, |
| { |
| "epoch": 4.31622418879056, |
| "grad_norm": 2.65625, |
| "learning_rate": 1.742793501048218e-06, |
| "loss": 0.4958, |
| "mean_token_accuracy": 0.8592533960938453, |
| "num_tokens": 7782186.0, |
| "step": 3660 |
| }, |
| { |
| "epoch": 4.328023598820059, |
| "grad_norm": 2.34375, |
| "learning_rate": 1.7414832285115303e-06, |
| "loss": 0.472, |
| "mean_token_accuracy": 0.8654198557138443, |
| "num_tokens": 7802810.0, |
| "step": 3670 |
| }, |
| { |
| "epoch": 4.3398230088495575, |
| "grad_norm": 2.53125, |
| "learning_rate": 1.7401729559748425e-06, |
| "loss": 0.4721, |
| "mean_token_accuracy": 0.8635499522089958, |
| "num_tokens": 7823077.0, |
| "step": 3680 |
| }, |
| { |
| "epoch": 4.351622418879056, |
| "grad_norm": 3.28125, |
| "learning_rate": 1.738862683438155e-06, |
| "loss": 0.4894, |
| "mean_token_accuracy": 0.8595067381858825, |
| "num_tokens": 7842952.0, |
| "step": 3690 |
| }, |
| { |
| "epoch": 4.363421828908555, |
| "grad_norm": 2.53125, |
| "learning_rate": 1.7375524109014674e-06, |
| "loss": 0.5084, |
| "mean_token_accuracy": 0.8585742250084877, |
| "num_tokens": 7864092.0, |
| "step": 3700 |
| }, |
| { |
| "epoch": 4.375221238938053, |
| "grad_norm": 3.265625, |
| "learning_rate": 1.7362421383647798e-06, |
| "loss": 0.53, |
| "mean_token_accuracy": 0.8504990801215172, |
| "num_tokens": 7884701.0, |
| "step": 3710 |
| }, |
| { |
| "epoch": 4.387020648967551, |
| "grad_norm": 3.734375, |
| "learning_rate": 1.7349318658280922e-06, |
| "loss": 0.4847, |
| "mean_token_accuracy": 0.861487266421318, |
| "num_tokens": 7906268.0, |
| "step": 3720 |
| }, |
| { |
| "epoch": 4.3988200589970505, |
| "grad_norm": 2.59375, |
| "learning_rate": 1.7336215932914044e-06, |
| "loss": 0.5308, |
| "mean_token_accuracy": 0.8516640156507492, |
| "num_tokens": 7928374.0, |
| "step": 3730 |
| }, |
| { |
| "epoch": 4.410619469026549, |
| "grad_norm": 2.640625, |
| "learning_rate": 1.7323113207547168e-06, |
| "loss": 0.4894, |
| "mean_token_accuracy": 0.8628440231084824, |
| "num_tokens": 7949261.0, |
| "step": 3740 |
| }, |
| { |
| "epoch": 4.422418879056047, |
| "grad_norm": 2.84375, |
| "learning_rate": 1.7310010482180293e-06, |
| "loss": 0.5159, |
| "mean_token_accuracy": 0.8537062495946884, |
| "num_tokens": 7970961.0, |
| "step": 3750 |
| }, |
| { |
| "epoch": 4.434218289085546, |
| "grad_norm": 3.140625, |
| "learning_rate": 1.7296907756813417e-06, |
| "loss": 0.492, |
| "mean_token_accuracy": 0.8576692581176758, |
| "num_tokens": 7991810.0, |
| "step": 3760 |
| }, |
| { |
| "epoch": 4.446017699115044, |
| "grad_norm": 3.046875, |
| "learning_rate": 1.7283805031446541e-06, |
| "loss": 0.4899, |
| "mean_token_accuracy": 0.8665280848741531, |
| "num_tokens": 8013818.0, |
| "step": 3770 |
| }, |
| { |
| "epoch": 4.457817109144543, |
| "grad_norm": 2.6875, |
| "learning_rate": 1.7270702306079663e-06, |
| "loss": 0.4778, |
| "mean_token_accuracy": 0.8673057004809379, |
| "num_tokens": 8034666.0, |
| "step": 3780 |
| }, |
| { |
| "epoch": 4.469616519174041, |
| "grad_norm": 2.859375, |
| "learning_rate": 1.7257599580712787e-06, |
| "loss": 0.4822, |
| "mean_token_accuracy": 0.8649532437324524, |
| "num_tokens": 8055207.0, |
| "step": 3790 |
| }, |
| { |
| "epoch": 4.48141592920354, |
| "grad_norm": 2.484375, |
| "learning_rate": 1.7244496855345912e-06, |
| "loss": 0.4995, |
| "mean_token_accuracy": 0.8592880621552468, |
| "num_tokens": 8076551.0, |
| "step": 3800 |
| }, |
| { |
| "epoch": 4.493215339233038, |
| "grad_norm": 2.96875, |
| "learning_rate": 1.7231394129979036e-06, |
| "loss": 0.476, |
| "mean_token_accuracy": 0.8607696041464805, |
| "num_tokens": 8098089.0, |
| "step": 3810 |
| }, |
| { |
| "epoch": 4.5050147492625365, |
| "grad_norm": 3.09375, |
| "learning_rate": 1.7218291404612158e-06, |
| "loss": 0.5019, |
| "mean_token_accuracy": 0.8576576009392738, |
| "num_tokens": 8118583.0, |
| "step": 3820 |
| }, |
| { |
| "epoch": 4.516814159292036, |
| "grad_norm": 3.25, |
| "learning_rate": 1.7205188679245282e-06, |
| "loss": 0.5447, |
| "mean_token_accuracy": 0.8493610918521881, |
| "num_tokens": 8139856.0, |
| "step": 3830 |
| }, |
| { |
| "epoch": 4.528613569321534, |
| "grad_norm": 3.34375, |
| "learning_rate": 1.7192085953878406e-06, |
| "loss": 0.4603, |
| "mean_token_accuracy": 0.8670093446969986, |
| "num_tokens": 8160023.0, |
| "step": 3840 |
| }, |
| { |
| "epoch": 4.540412979351032, |
| "grad_norm": 3.5, |
| "learning_rate": 1.717898322851153e-06, |
| "loss": 0.476, |
| "mean_token_accuracy": 0.8638666957616806, |
| "num_tokens": 8180835.0, |
| "step": 3850 |
| }, |
| { |
| "epoch": 4.552212389380531, |
| "grad_norm": 2.65625, |
| "learning_rate": 1.7165880503144655e-06, |
| "loss": 0.4618, |
| "mean_token_accuracy": 0.8650890454649925, |
| "num_tokens": 8203098.0, |
| "step": 3860 |
| }, |
| { |
| "epoch": 4.5640117994100295, |
| "grad_norm": 3.609375, |
| "learning_rate": 1.7152777777777775e-06, |
| "loss": 0.5672, |
| "mean_token_accuracy": 0.8504336610436439, |
| "num_tokens": 8224557.0, |
| "step": 3870 |
| }, |
| { |
| "epoch": 4.575811209439528, |
| "grad_norm": 3.375, |
| "learning_rate": 1.71396750524109e-06, |
| "loss": 0.485, |
| "mean_token_accuracy": 0.8632472082972527, |
| "num_tokens": 8246168.0, |
| "step": 3880 |
| }, |
| { |
| "epoch": 4.587610619469027, |
| "grad_norm": 2.78125, |
| "learning_rate": 1.7126572327044023e-06, |
| "loss": 0.4919, |
| "mean_token_accuracy": 0.8595013156533241, |
| "num_tokens": 8267661.0, |
| "step": 3890 |
| }, |
| { |
| "epoch": 4.599410029498525, |
| "grad_norm": 3.5, |
| "learning_rate": 1.7113469601677148e-06, |
| "loss": 0.5396, |
| "mean_token_accuracy": 0.8498777762055397, |
| "num_tokens": 8288859.0, |
| "step": 3900 |
| }, |
| { |
| "epoch": 4.611209439528023, |
| "grad_norm": 2.75, |
| "learning_rate": 1.7100366876310272e-06, |
| "loss": 0.5269, |
| "mean_token_accuracy": 0.8533955857157707, |
| "num_tokens": 8310460.0, |
| "step": 3910 |
| }, |
| { |
| "epoch": 4.623008849557522, |
| "grad_norm": 2.875, |
| "learning_rate": 1.7087264150943394e-06, |
| "loss": 0.4904, |
| "mean_token_accuracy": 0.8650319501757622, |
| "num_tokens": 8332408.0, |
| "step": 3920 |
| }, |
| { |
| "epoch": 4.634808259587021, |
| "grad_norm": 2.90625, |
| "learning_rate": 1.7074161425576518e-06, |
| "loss": 0.456, |
| "mean_token_accuracy": 0.8670095428824425, |
| "num_tokens": 8353805.0, |
| "step": 3930 |
| }, |
| { |
| "epoch": 4.646607669616519, |
| "grad_norm": 2.015625, |
| "learning_rate": 1.7061058700209642e-06, |
| "loss": 0.4988, |
| "mean_token_accuracy": 0.8611832678318023, |
| "num_tokens": 8375992.0, |
| "step": 3940 |
| }, |
| { |
| "epoch": 4.658407079646018, |
| "grad_norm": 3.359375, |
| "learning_rate": 1.7047955974842767e-06, |
| "loss": 0.5219, |
| "mean_token_accuracy": 0.851292310655117, |
| "num_tokens": 8397073.0, |
| "step": 3950 |
| }, |
| { |
| "epoch": 4.670206489675516, |
| "grad_norm": 2.0625, |
| "learning_rate": 1.703485324947589e-06, |
| "loss": 0.4742, |
| "mean_token_accuracy": 0.8624649658799172, |
| "num_tokens": 8420091.0, |
| "step": 3960 |
| }, |
| { |
| "epoch": 4.682005899705015, |
| "grad_norm": 2.625, |
| "learning_rate": 1.7021750524109013e-06, |
| "loss": 0.5047, |
| "mean_token_accuracy": 0.8583678498864173, |
| "num_tokens": 8442036.0, |
| "step": 3970 |
| }, |
| { |
| "epoch": 4.693805309734513, |
| "grad_norm": 4.15625, |
| "learning_rate": 1.7008647798742137e-06, |
| "loss": 0.476, |
| "mean_token_accuracy": 0.8638784185051918, |
| "num_tokens": 8462999.0, |
| "step": 3980 |
| }, |
| { |
| "epoch": 4.705604719764012, |
| "grad_norm": 5.15625, |
| "learning_rate": 1.6995545073375262e-06, |
| "loss": 0.5316, |
| "mean_token_accuracy": 0.8543857142329216, |
| "num_tokens": 8483628.0, |
| "step": 3990 |
| }, |
| { |
| "epoch": 4.71740412979351, |
| "grad_norm": 2.796875, |
| "learning_rate": 1.6982442348008386e-06, |
| "loss": 0.4946, |
| "mean_token_accuracy": 0.861135421693325, |
| "num_tokens": 8507043.0, |
| "step": 4000 |
| }, |
| { |
| "epoch": 4.729203539823009, |
| "grad_norm": 3.34375, |
| "learning_rate": 1.696933962264151e-06, |
| "loss": 0.4821, |
| "mean_token_accuracy": 0.8621435165405273, |
| "num_tokens": 8529905.0, |
| "step": 4010 |
| }, |
| { |
| "epoch": 4.741002949852508, |
| "grad_norm": 3.453125, |
| "learning_rate": 1.6956236897274632e-06, |
| "loss": 0.502, |
| "mean_token_accuracy": 0.8593701273202896, |
| "num_tokens": 8550481.0, |
| "step": 4020 |
| }, |
| { |
| "epoch": 4.752802359882006, |
| "grad_norm": 2.65625, |
| "learning_rate": 1.6943134171907756e-06, |
| "loss": 0.5313, |
| "mean_token_accuracy": 0.8543563172221184, |
| "num_tokens": 8571835.0, |
| "step": 4030 |
| }, |
| { |
| "epoch": 4.764601769911504, |
| "grad_norm": 3.5625, |
| "learning_rate": 1.693003144654088e-06, |
| "loss": 0.5338, |
| "mean_token_accuracy": 0.8556608721613884, |
| "num_tokens": 8592828.0, |
| "step": 4040 |
| }, |
| { |
| "epoch": 4.776401179941003, |
| "grad_norm": 2.859375, |
| "learning_rate": 1.6916928721174005e-06, |
| "loss": 0.4917, |
| "mean_token_accuracy": 0.8639015153050422, |
| "num_tokens": 8615452.0, |
| "step": 4050 |
| }, |
| { |
| "epoch": 4.788200589970502, |
| "grad_norm": 3.375, |
| "learning_rate": 1.6903825995807127e-06, |
| "loss": 0.5134, |
| "mean_token_accuracy": 0.8572206899523735, |
| "num_tokens": 8635810.0, |
| "step": 4060 |
| }, |
| { |
| "epoch": 4.8, |
| "grad_norm": 2.796875, |
| "learning_rate": 1.6890723270440251e-06, |
| "loss": 0.4731, |
| "mean_token_accuracy": 0.8653491243720055, |
| "num_tokens": 8656932.0, |
| "step": 4070 |
| }, |
| { |
| "epoch": 4.811799410029499, |
| "grad_norm": 2.84375, |
| "learning_rate": 1.6877620545073375e-06, |
| "loss": 0.4631, |
| "mean_token_accuracy": 0.864352998137474, |
| "num_tokens": 8678576.0, |
| "step": 4080 |
| }, |
| { |
| "epoch": 4.823598820058997, |
| "grad_norm": 2.796875, |
| "learning_rate": 1.68645178197065e-06, |
| "loss": 0.4747, |
| "mean_token_accuracy": 0.8598879009485245, |
| "num_tokens": 8699699.0, |
| "step": 4090 |
| }, |
| { |
| "epoch": 4.8353982300884955, |
| "grad_norm": 3.15625, |
| "learning_rate": 1.6851415094339622e-06, |
| "loss": 0.524, |
| "mean_token_accuracy": 0.8518863573670388, |
| "num_tokens": 8720866.0, |
| "step": 4100 |
| }, |
| { |
| "epoch": 4.847197640117994, |
| "grad_norm": 3.453125, |
| "learning_rate": 1.6838312368972744e-06, |
| "loss": 0.4444, |
| "mean_token_accuracy": 0.8683850839734077, |
| "num_tokens": 8741995.0, |
| "step": 4110 |
| }, |
| { |
| "epoch": 4.858997050147493, |
| "grad_norm": 2.875, |
| "learning_rate": 1.6825209643605868e-06, |
| "loss": 0.5355, |
| "mean_token_accuracy": 0.8503773123025894, |
| "num_tokens": 8763684.0, |
| "step": 4120 |
| }, |
| { |
| "epoch": 4.870796460176991, |
| "grad_norm": 2.453125, |
| "learning_rate": 1.6812106918238992e-06, |
| "loss": 0.4867, |
| "mean_token_accuracy": 0.8581328928470612, |
| "num_tokens": 8783920.0, |
| "step": 4130 |
| }, |
| { |
| "epoch": 4.882595870206489, |
| "grad_norm": 3.21875, |
| "learning_rate": 1.6799004192872117e-06, |
| "loss": 0.4766, |
| "mean_token_accuracy": 0.8638118103146553, |
| "num_tokens": 8804409.0, |
| "step": 4140 |
| }, |
| { |
| "epoch": 4.8943952802359885, |
| "grad_norm": 3.09375, |
| "learning_rate": 1.678590146750524e-06, |
| "loss": 0.4517, |
| "mean_token_accuracy": 0.8670265376567841, |
| "num_tokens": 8825178.0, |
| "step": 4150 |
| }, |
| { |
| "epoch": 4.906194690265487, |
| "grad_norm": 2.71875, |
| "learning_rate": 1.6772798742138363e-06, |
| "loss": 0.5043, |
| "mean_token_accuracy": 0.8601580545306206, |
| "num_tokens": 8845609.0, |
| "step": 4160 |
| }, |
| { |
| "epoch": 4.917994100294985, |
| "grad_norm": 2.78125, |
| "learning_rate": 1.6759696016771487e-06, |
| "loss": 0.4743, |
| "mean_token_accuracy": 0.8572592064738274, |
| "num_tokens": 8865960.0, |
| "step": 4170 |
| }, |
| { |
| "epoch": 4.929793510324484, |
| "grad_norm": 3.203125, |
| "learning_rate": 1.6746593291404611e-06, |
| "loss": 0.5298, |
| "mean_token_accuracy": 0.856391716003418, |
| "num_tokens": 8886304.0, |
| "step": 4180 |
| }, |
| { |
| "epoch": 4.941592920353982, |
| "grad_norm": 3.078125, |
| "learning_rate": 1.6733490566037736e-06, |
| "loss": 0.496, |
| "mean_token_accuracy": 0.8596863955259323, |
| "num_tokens": 8906482.0, |
| "step": 4190 |
| }, |
| { |
| "epoch": 4.953392330383481, |
| "grad_norm": 3.03125, |
| "learning_rate": 1.672038784067086e-06, |
| "loss": 0.4952, |
| "mean_token_accuracy": 0.8551343813538551, |
| "num_tokens": 8927802.0, |
| "step": 4200 |
| }, |
| { |
| "epoch": 4.96519174041298, |
| "grad_norm": 2.703125, |
| "learning_rate": 1.6707285115303982e-06, |
| "loss": 0.4547, |
| "mean_token_accuracy": 0.8705837085843087, |
| "num_tokens": 8947991.0, |
| "step": 4210 |
| }, |
| { |
| "epoch": 4.976991150442478, |
| "grad_norm": 3.171875, |
| "learning_rate": 1.6694182389937106e-06, |
| "loss": 0.4981, |
| "mean_token_accuracy": 0.8570305466651916, |
| "num_tokens": 8968209.0, |
| "step": 4220 |
| }, |
| { |
| "epoch": 4.988790560471976, |
| "grad_norm": 3.328125, |
| "learning_rate": 1.668107966457023e-06, |
| "loss": 0.5232, |
| "mean_token_accuracy": 0.8568624272942543, |
| "num_tokens": 8989592.0, |
| "step": 4230 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 16960, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 20, |
| "save_steps": 4235, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 9.523563792352051e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |